LCOV - code coverage report
Current view: top level - src/torque - earley-parser.h (source / functions) Hit Total Coverage
Test: app.info Lines: 96 97 99.0 %
Date: 2019-04-17 Functions: 238 297 80.1 %

          Line data    Source code
       1             : // Copyright 2018 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #ifndef V8_TORQUE_EARLEY_PARSER_H_
       6             : #define V8_TORQUE_EARLEY_PARSER_H_
       7             : 
       8             : #include <map>
       9             : #include <vector>
      10             : 
      11             : #include "src/base/optional.h"
      12             : #include "src/torque/contextual.h"
      13             : #include "src/torque/source-positions.h"
      14             : #include "src/torque/utils.h"
      15             : 
      16             : namespace v8 {
      17             : namespace internal {
      18             : namespace torque {
      19             : 
      20             : class Symbol;
      21             : class Item;
      22             : 
      23             : class ParseResultHolderBase {
      24             :  public:
      25             :   enum class TypeId;
      26      130706 :   virtual ~ParseResultHolderBase() = default;
      27             :   template <class T>
      28             :   T& Cast();
      29             :   template <class T>
      30             :   const T& Cast() const;
      31             : 
      32             :  protected:
      33      130706 :   explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) {
      34             :     // MSVC wrongly complains about type_id_ being an unused private field.
      35             :     USE(type_id_);
      36             :   }
      37             : 
      38             :  private:
      39             :   const TypeId type_id_;
      40             : };
      41             : 
      42             : enum class ParseResultHolderBase::TypeId {
      43             :   kStdString,
      44             :   kBool,
      45             :   kStdVectorOfString,
      46             :   kExpressionPtr,
      47             :   kIdentifierPtr,
      48             :   kOptionalIdentifierPtr,
      49             :   kStatementPtr,
      50             :   kDeclarationPtr,
      51             :   kTypeExpressionPtr,
      52             :   kOptionalTypeExpressionPtr,
      53             :   kLabelBlockPtr,
      54             :   kOptionalLabelBlockPtr,
      55             :   kNameAndTypeExpression,
      56             :   kNameAndExpression,
      57             :   kClassFieldExpression,
      58             :   kStructFieldExpression,
      59             :   kStdVectorOfNameAndTypeExpression,
      60             :   kStdVectorOfNameAndExpression,
      61             :   kStdVectorOfClassFieldExpression,
      62             :   kStdVectorOfStructFieldExpression,
      63             :   kIncrementDecrementOperator,
      64             :   kOptionalStdString,
      65             :   kStdVectorOfStatementPtr,
      66             :   kStdVectorOfDeclarationPtr,
      67             :   kStdVectorOfExpressionPtr,
      68             :   kExpressionWithSource,
      69             :   kParameterList,
      70             :   kRangeExpression,
      71             :   kOptionalRangeExpression,
      72             :   kTypeList,
      73             :   kOptionalTypeList,
      74             :   kLabelAndTypes,
      75             :   kStdVectorOfLabelAndTypes,
      76             :   kStdVectorOfLabelBlockPtr,
      77             :   kOptionalStatementPtr,
      78             :   kOptionalExpressionPtr,
      79             :   kTypeswitchCase,
      80             :   kStdVectorOfTypeswitchCase,
      81             :   kStdVectorOfIdentifierPtr,
      82             : 
      83             :   kJsonValue,
      84             :   kJsonMember,
      85             :   kStdVectorOfJsonValue,
      86             :   kStdVectorOfJsonMember,
      87             : };
      88             : 
      89             : using ParseResultTypeId = ParseResultHolderBase::TypeId;
      90             : 
      91             : template <class T>
      92      261412 : class ParseResultHolder : public ParseResultHolderBase {
      93             :  public:
      94         171 :   explicit ParseResultHolder(T value)
      95      130937 :       : ParseResultHolderBase(id), value_(std::move(value)) {}
      96             : 
      97             :  private:
      98             :   V8_EXPORT_PRIVATE static const TypeId id;
      99             :   friend class ParseResultHolderBase;
     100             :   T value_;
     101             : };
     102             : 
     103             : template <class T>
     104             : T& ParseResultHolderBase::Cast() {
     105      130706 :   CHECK_EQ(ParseResultHolder<T>::id, type_id_);
     106        1618 :   return static_cast<ParseResultHolder<T>*>(this)->value_;
     107             : }
     108             : 
     109             : template <class T>
     110             : const T& ParseResultHolderBase::Cast() const {
     111             :   CHECK_EQ(ParseResultHolder<T>::id, type_id_);
     112             :   return static_cast<const ParseResultHolder<T>*>(this)->value_;
     113             : }
     114             : 
     115      917713 : class ParseResult {
     116             :  public:
     117             :   template <class T>
     118      283877 :   explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {}
     119             : 
     120             :   template <class T>
     121             :   const T& Cast() const& {
     122             :     return value_->Cast<T>();
     123             :   }
     124             :   template <class T>
     125             :   T& Cast() & {
     126             :     return value_->Cast<T>();
     127             :   }
     128             :   template <class T>
     129             :   T&& Cast() && {
     130             :     return std::move(value_->Cast<T>());
     131             :   }
     132             : 
     133             :  private:
     134             :   std::unique_ptr<ParseResultHolderBase> value_;
     135             : };
     136             : 
     137             : using InputPosition = const char*;
     138             : 
     139             : struct MatchedInput {
     140             :   MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos)
     141      312665 :       : begin(begin), end(end), pos(pos) {}
     142             :   InputPosition begin;
     143             :   InputPosition end;
     144             :   SourcePosition pos;
     145             :   std::string ToString() const { return {begin, end}; }
     146             : };
     147             : 
     148             : class ParseResultIterator {
     149             :  public:
     150             :   explicit ParseResultIterator(std::vector<ParseResult> results,
     151             :                                MatchedInput matched_input)
     152      252841 :       : results_(std::move(results)), matched_input_(matched_input) {}
     153      505682 :   ~ParseResultIterator() {
     154             :     // Check that all parse results have been used.
     155      505682 :     CHECK_EQ(results_.size(), i_);
     156      252841 :   }
     157             : 
     158      251803 :   ParseResult Next() {
     159      251803 :     CHECK_LT(i_, results_.size());
     160      503606 :     return std::move(results_[i_++]);
     161             :   }
     162             :   template <class T>
     163      130692 :   T NextAs() {
     164      318002 :     return std::move(Next().Cast<T>());
     165             :   }
     166      122951 :   bool HasNext() const { return i_ < results_.size(); }
     167             : 
     168             :   const MatchedInput& matched_input() const { return matched_input_; }
     169             : 
     170             :  private:
     171             :   std::vector<ParseResult> results_;
     172             :   size_t i_ = 0;
     173             :   MatchedInput matched_input_;
     174             : 
     175             :   DISALLOW_COPY_AND_ASSIGN(ParseResultIterator);
     176             : };
     177             : 
     178         140 : struct LexerResult {
     179             :   std::vector<Symbol*> token_symbols;
     180             :   std::vector<MatchedInput> token_contents;
     181             : };
     182             : 
     183             : using Action =
     184             :     base::Optional<ParseResult> (*)(ParseResultIterator* child_results);
     185             : 
     186      121169 : inline base::Optional<ParseResult> DefaultAction(
     187             :     ParseResultIterator* child_results) {
     188      121169 :   if (!child_results->HasNext()) return base::nullopt;
     189      242222 :   return child_results->Next();
     190             : }
     191             : 
     192             : // A rule of the context-free grammar. Each rule can have an action attached to
     193             : // it, which is executed after the parsing is finished.
     194       59763 : class Rule final {
     195             :  public:
     196             :   explicit Rule(std::vector<Symbol*> right_hand_side,
     197             :                 Action action = DefaultAction)
     198       35980 :       : right_hand_side_(std::move(right_hand_side)), action_(action) {}
     199             : 
     200             :   Symbol* left() const {
     201             :     DCHECK_NOT_NULL(left_hand_side_);
     202             :     return left_hand_side_;
     203             :   }
     204             :   const std::vector<Symbol*>& right() const { return right_hand_side_; }
     205             : 
     206             :   void SetLeftHandSide(Symbol* left_hand_side) {
     207             :     DCHECK_NULL(left_hand_side_);
     208       19921 :     left_hand_side_ = left_hand_side;
     209             :   }
     210             : 
     211             :   V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
     212             :       const Item* completed_item, const LexerResult& tokens) const;
     213             : 
     214             :  private:
     215             :   Symbol* left_hand_side_ = nullptr;
     216             :   std::vector<Symbol*> right_hand_side_;
     217             :   Action action_;
     218             : };
     219             : 
     220             : // A Symbol represents a terminal or a non-terminal of the grammar.
     221             : // It stores the list of rules, which have this symbol as the
     222             : // left-hand side.
     223             : // Terminals have an empty list of rules, they are created by the Lexer
     224             : // instead of from rules.
     225             : // Symbols need to reside at stable memory addresses, because the addresses are
     226             : // used in the parser.
     227       12518 : class Symbol {
     228             :  public:
     229        5905 :   Symbol() : Symbol({}) {}
     230       30104 :   Symbol(std::initializer_list<Rule> rules) { *this = rules; }
     231             : 
     232             :   V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules);
     233             : 
     234             :   bool IsTerminal() const { return rules_.empty(); }
     235             :   Rule* rule(size_t index) const { return rules_[index].get(); }
     236             :   size_t rule_number() const { return rules_.size(); }
     237             : 
     238       19921 :   void AddRule(const Rule& rule) {
     239       39842 :     rules_.push_back(base::make_unique<Rule>(rule));
     240             :     rules_.back()->SetLeftHandSide(this);
     241       19921 :   }
     242             : 
     243             :   V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
     244             :       const Item* item, const LexerResult& tokens);
     245             : 
     246             :  private:
     247             :   std::vector<std::unique_ptr<Rule>> rules_;
     248             : 
     249             :   // Disallow copying and moving to ensure Symbol has a stable address.
     250             :   DISALLOW_COPY_AND_ASSIGN(Symbol);
     251             : };
     252             : 
     253             : // Items are the core datastructure of Earley's algorithm.
     254             : // They consist of a (partially) matched rule, a marked position inside of the
     255             : // right-hand side of the rule (traditionally written as a dot) and an input
     256             : // range from {start} to {pos} that matches the symbols of the right-hand side
     257             : // that are left of the mark. In addition, they store a child and a left-sibling
     258             : // pointer to reconstruct the AST in the end.
     259             : class Item {
     260             :  public:
     261             :   Item(const Rule* rule, size_t mark, size_t start, size_t pos)
     262     6923193 :       : rule_(rule), mark_(mark), start_(start), pos_(pos) {
     263             :     DCHECK_LE(mark_, right().size());
     264             :   }
     265             : 
     266             :   // A complete item has the mark at the right end, which means the input range
     267             :   // matches the complete rule.
     268             :   bool IsComplete() const {
     269             :     DCHECK_LE(mark_, right().size());
     270     2747911 :     return mark_ == right().size();
     271             :   }
     272             : 
     273             :   // The symbol right after the mark is expected at {pos} for this item to
     274             :   // advance.
     275             :   Symbol* NextSymbol() const {
     276             :     DCHECK(!IsComplete());
     277             :     DCHECK_LT(mark_, right().size());
     278     2322868 :     return right()[mark_];
     279             :   }
     280             : 
     281             :   // We successfully parsed NextSymbol() between {pos} and {new_pos}.
     282             :   // If NextSymbol() was a non-terminal, then {child} is a pointer to a
     283             :   // completed item for this parse.
     284             :   // We create a new item, which moves the mark one forward.
     285             :   Item Advance(size_t new_pos, const Item* child = nullptr) const {
     286             :     if (child) {
     287             :       DCHECK(child->IsComplete());
     288             :       DCHECK_EQ(pos(), child->start());
     289             :       DCHECK_EQ(new_pos, child->pos());
     290             :       DCHECK_EQ(NextSymbol(), child->left());
     291             :     }
     292      764809 :     Item result(rule_, mark_ + 1, start_, new_pos);
     293      764809 :     result.prev_ = this;
     294      696640 :     result.child_ = child;
     295             :     return result;
     296             :   }
     297             : 
     298             :   // Collect the items representing the AST children of this completed item.
     299             :   std::vector<const Item*> Children() const;
     300             :   // The matched input separated according to the next branching AST level.
     301             :   std::string SplitByChildren(const LexerResult& tokens) const;
     302             :   // Check if {other} results in the same AST as this Item.
     303             :   void CheckAmbiguity(const Item& other, const LexerResult& tokens) const;
     304             : 
     305             :   MatchedInput GetMatchedInput(const LexerResult& tokens) const {
     306      252841 :     return {tokens.token_contents[start_].begin,
     307             :             start_ == pos_ ? tokens.token_contents[start_].begin
     308      214669 :                            : tokens.token_contents[pos_ - 1].end,
     309      467510 :             tokens.token_contents[start_].pos};
     310             :   }
     311             : 
     312             :   // We exclude {prev_} and {child_} from equality and hash computations,
     313             :   // because they are just globally unique data associated with an item.
     314             :   bool operator==(const Item& other) const {
     315     3291567 :     return rule_ == other.rule_ && mark_ == other.mark_ &&
     316     3291567 :            start_ == other.start_ && pos_ == other.pos_;
     317             :   }
     318             : 
     319     6923193 :   friend size_t hash_value(const Item& i) {
     320     6923193 :     return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_);
     321             :   }
     322             : 
     323      252773 :   const Rule* rule() const { return rule_; }
     324             :   Symbol* left() const { return rule_->left(); }
     325             :   const std::vector<Symbol*>& right() const { return rule_->right(); }
     326             :   size_t pos() const { return pos_; }
     327             :   size_t start() const { return start_; }
     328             : 
     329             :  private:
     330             :   const Rule* rule_;
     331             :   size_t mark_;
     332             :   size_t start_;
     333             :   size_t pos_;
     334             : 
     335             :   const Item* prev_ = nullptr;
     336             :   const Item* child_ = nullptr;
     337             : };
     338             : 
     339             : inline base::Optional<ParseResult> Symbol::RunAction(
     340             :     const Item* item, const LexerResult& tokens) {
     341             :   DCHECK(item->IsComplete());
     342             :   DCHECK_EQ(item->left(), this);
     343      252841 :   return item->rule()->RunAction(item, tokens);
     344             : }
     345             : 
     346             : V8_EXPORT_PRIVATE const Item* RunEarleyAlgorithm(
     347             :     Symbol* start, const LexerResult& tokens,
     348             :     std::unordered_set<Item, base::hash<Item>>* processed);
     349             : 
     350          69 : inline base::Optional<ParseResult> ParseTokens(Symbol* start,
     351             :                                                const LexerResult& tokens) {
     352             :   std::unordered_set<Item, base::hash<Item>> table;
     353          69 :   const Item* final_item = RunEarleyAlgorithm(start, tokens, &table);
     354          68 :   return start->RunAction(final_item, tokens);
     355             : }
     356             : 
     357             : // The lexical syntax is dynamically defined while building the grammar by
     358             : // adding patterns and keywords to the Lexer.
     359             : // The term keyword here can stand for any fixed character sequence, including
     360             : // operators and parentheses.
     361             : // Each pattern or keyword automatically gets a terminal symbol associated with
     362             : // it. These symbols form the result of the lexing.
     363             : // Patterns and keywords are matched using the longest match principle. If the
     364             : // longest matching pattern coincides with a keyword, the keyword symbol is
     365             : // chosen instead of the pattern.
     366             : // In addition, there is a single whitespace pattern which is consumed but does
     367             : // not become part of the token list.
     368         207 : class Lexer {
     369             :  public:
     370             :   // Functions to define patterns. They try to match starting from {pos}. If
     371             :   // successful, they return true and advance {pos}. Otherwise, {pos} stays
     372             :   // unchanged.
     373             :   using PatternFunction = bool (*)(InputPosition* pos);
     374             : 
     375             :   void SetWhitespace(PatternFunction whitespace) {
     376          69 :     match_whitespace_ = whitespace;
     377             :   }
     378             : 
     379         313 :   Symbol* Pattern(PatternFunction pattern) { return &patterns_[pattern]; }
     380       12349 :   Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; }
     381             :   V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input);
     382             : 
     383             :  private:
     384           0 :   PatternFunction match_whitespace_ = [](InputPosition*) { return false; };
     385             :   std::map<PatternFunction, Symbol> patterns_;
     386             :   std::map<std::string, Symbol> keywords_;
     387             :   Symbol* MatchToken(InputPosition* pos, InputPosition end);
     388             : };
     389             : 
     390             : // A grammar can have a result, which is the results of the start symbol.
     391             : // Grammar is intended to be subclassed, with Symbol members forming the
     392             : // mutually recursive rules of the grammar.
     393          69 : class Grammar {
     394             :  public:
     395             :   using PatternFunction = Lexer::PatternFunction;
     396             : 
     397          69 :   explicit Grammar(Symbol* start) : start_(start) {}
     398             : 
     399          70 :   base::Optional<ParseResult> Parse(const std::string& input) {
     400         139 :     LexerResult tokens = lexer().RunLexer(input);
     401         137 :     return ParseTokens(start_, tokens);
     402             :   }
     403             : 
     404             :  protected:
     405             :   Symbol* Token(const std::string& s) { return lexer_.Token(s); }
     406             :   Symbol* Pattern(PatternFunction pattern) { return lexer_.Pattern(pattern); }
     407             :   void SetWhitespace(PatternFunction ws) { lexer_.SetWhitespace(ws); }
     408             : 
     409             :   // NewSymbol() allocates a fresh symbol and stores it in the current grammar.
     410             :   // This is necessary to define helpers that create new symbols.
     411        6914 :   Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) {
     412        6914 :     Symbol* result = new Symbol(rules);
     413       13828 :     generated_symbols_.push_back(std::unique_ptr<Symbol>(result));
     414        6914 :     return result;
     415             :   }
     416             : 
     417             :   // Helper functions to define lexer patterns. If they match, they return true
     418             :   // and advance {pos}. Otherwise, {pos} is unchanged.
     419             :   V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int),
     420             :                                           InputPosition* pos);
     421             :   V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char),
     422             :                                           InputPosition* pos);
     423             :   V8_EXPORT_PRIVATE static bool MatchAnyChar(InputPosition* pos);
     424             :   V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos);
     425             : 
     426             :   // The action MatchInput() produces the input matched by the rule as
     427             :   // result.
     428       24868 :   static base::Optional<ParseResult> YieldMatchedInput(
     429             :       ParseResultIterator* child_results) {
     430       74604 :     return ParseResult{child_results->matched_input().ToString()};
     431             :   }
     432             : 
     433             :   // Create a new symbol to parse the given sequence of symbols.
     434             :   // At most one of the symbols can return a result.
     435        1134 :   Symbol* Sequence(std::vector<Symbol*> symbols) {
     436        2268 :     return NewSymbol({Rule(std::move(symbols))});
     437             :   }
     438             : 
     439             :   template <class T, T value>
     440        9994 :   static base::Optional<ParseResult> YieldIntegralConstant(
     441             :       ParseResultIterator* child_results) {
     442        9994 :     return ParseResult{value};
     443             :   }
     444             : 
     445             :   template <class T>
     446       29492 :   static base::Optional<ParseResult> YieldDefaultValue(
     447             :       ParseResultIterator* child_results) {
     448       72822 :     return ParseResult{T{}};
     449             :   }
     450             : 
     451             :   template <class From, class To>
     452        9828 :   static base::Optional<ParseResult> CastParseResult(
     453             :       ParseResultIterator* child_results) {
     454       17749 :     To result = std::move(child_results->NextAs<From>());
     455       27658 :     return ParseResult{std::move(result)};
     456             :   }
     457             : 
     458             :   // Try to parse {s} and return the result of type {Result} casted to {T}.
     459             :   // Otherwise, the result is a default-constructed {T}.
     460             :   template <class T, class Result = T>
     461        2782 :   Symbol* TryOrDefault(Symbol* s) {
     462       19474 :     return NewSymbol({Rule({s}, CastParseResult<Result, T>),
     463        5564 :                       Rule({}, YieldDefaultValue<T>)});
     464             :   }
     465             : 
     466             :   template <class T>
     467        6240 :   static base::Optional<ParseResult> MakeSingletonVector(
     468             :       ParseResultIterator* child_results) {
     469        6395 :     T x = child_results->NextAs<T>();
     470         374 :     std::vector<T> result;
     471             :     result.push_back(std::move(x));
     472       24586 :     return ParseResult{std::move(result)};
     473             :   }
     474             : 
     475             :   template <class T>
     476        6063 :   static base::Optional<ParseResult> MakeExtendedVector(
     477             :       ParseResultIterator* child_results) {
     478        6348 :     std::vector<T> l = child_results->NextAs<std::vector<T>>();
     479        6090 :     T x = child_results->NextAs<T>();
     480             :     l.push_back(std::move(x));
     481       23967 :     return ParseResult{std::move(l)};
     482             :   }
     483             : 
     484             :   // For example, NonemptyList(Token("A"), Token(",")) parses any of
     485             :   // A or A,A or A,A,A and so on.
     486             :   template <class T>
     487        1108 :   Symbol* NonemptyList(Symbol* element,
     488             :                        base::Optional<Symbol*> separator = {}) {
     489        1108 :     Symbol* list = NewSymbol();
     490        7242 :     *list = {Rule({element}, MakeSingletonVector<T>),
     491             :              separator
     492             :                  ? Rule({list, *separator, element}, MakeExtendedVector<T>)
     493             :                  : Rule({list, element}, MakeExtendedVector<T>)};
     494        1108 :     return list;
     495             :   }
     496             : 
     497             :   template <class T>
     498             :   Symbol* List(Symbol* element, base::Optional<Symbol*> separator = {}) {
     499         892 :     return TryOrDefault<std::vector<T>>(NonemptyList<T>(element, separator));
     500             :   }
     501             : 
     502             :   template <class T>
     503             :   Symbol* Optional(Symbol* x) {
     504        1188 :     return TryOrDefault<base::Optional<T>, T>(x);
     505             :   }
     506             : 
     507         972 :   Symbol* CheckIf(Symbol* x) {
     508        6804 :     return NewSymbol({Rule({x}, YieldIntegralConstant<bool, true>),
     509        1944 :                       Rule({}, YieldIntegralConstant<bool, false>)});
     510             :   }
     511             : 
     512          70 :   Lexer& lexer() { return lexer_; }
     513             : 
     514             :  private:
     515             :   Lexer lexer_;
     516             :   std::vector<std::unique_ptr<Symbol>> generated_symbols_;
     517             :   Symbol* start_;
     518             : };
     519             : 
     520             : }  // namespace torque
     521             : }  // namespace internal
     522             : }  // namespace v8
     523             : 
     524             : #endif  // V8_TORQUE_EARLEY_PARSER_H_

Generated by: LCOV version 1.10