LCOV - code coverage report
Current view: top level - src/torque - earley-parser.h (source / functions) Hit Total Coverage
Test: app.info Lines: 102 103 99.0 %
Date: 2019-01-20 Functions: 219 261 83.9 %

          Line data    Source code
       1             : // Copyright 2018 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #ifndef V8_TORQUE_EARLEY_PARSER_H_
       6             : #define V8_TORQUE_EARLEY_PARSER_H_
       7             : 
       8             : #include <map>
       9             : #include <vector>
      10             : 
      11             : #include "src/base/optional.h"
      12             : #include "src/torque/contextual.h"
      13             : #include "src/torque/source-positions.h"
      14             : #include "src/torque/utils.h"
      15             : 
      16             : namespace v8 {
      17             : namespace internal {
      18             : namespace torque {
      19             : 
      20             : class Symbol;
      21             : class Item;
      22             : 
      23             : class ParseResultHolderBase {
      24             :  public:
      25             :   enum class TypeId;
      26       89979 :   virtual ~ParseResultHolderBase() = default;
      27             :   template <class T>
      28             :   T& Cast();
      29             :   template <class T>
      30             :   const T& Cast() const;
      31             : 
      32             :  protected:
      33       89979 :   explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) {
      34             :     // MSVC wrongly complains about type_id_ being an unused private field.
      35             :     USE(type_id_);
      36             :   }
      37             : 
      38             :  private:
      39             :   const TypeId type_id_;
      40             : };
      41             : 
      42             : using ParseResultTypeId = ParseResultHolderBase::TypeId;
      43             : 
      44             : template <class T>
      45      179958 : class ParseResultHolder : public ParseResultHolderBase {
      46             :  public:
      47           3 :   explicit ParseResultHolder(T value)
      48       90006 :       : ParseResultHolderBase(id), value_(std::move(value)) {}
      49             : 
      50             :  private:
      51             :   V8_EXPORT_PRIVATE static const TypeId id;
      52             :   friend class ParseResultHolderBase;
      53             :   T value_;
      54             : };
      55             : 
      56             : template <class T>
      57       89979 : T& ParseResultHolderBase::Cast() {
      58       89979 :   CHECK_EQ(ParseResultHolder<T>::id, type_id_);
      59       89979 :   return static_cast<ParseResultHolder<T>*>(this)->value_;
      60             : }
      61             : 
      62             : template <class T>
      63             : const T& ParseResultHolderBase::Cast() const {
      64             :   CHECK_EQ(ParseResultHolder<T>::id, type_id_);
      65             :   return static_cast<const ParseResultHolder<T>*>(this)->value_;
      66             : }
      67             : 
      68             : class ParseResult {
      69             :  public:
      70             :   template <class T>
      71      204884 :   explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {}
      72             : 
      73             :   template <class T>
      74             :   const T& Cast() const {
      75             :     return value_->Cast<T>();
      76             :   }
      77             :   template <class T>
      78             :   T& Cast() {
      79       89979 :     return value_->Cast<T>();
      80             :   }
      81             : 
      82             :  private:
      83             :   std::unique_ptr<ParseResultHolderBase> value_;
      84             : };
      85             : 
      86             : using InputPosition = const char*;
      87             : 
      88             : struct MatchedInput {
      89             :   MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos)
      90      222331 :       : begin(begin), end(end), pos(pos) {}
      91             :   InputPosition begin;
      92             :   InputPosition end;
      93             :   SourcePosition pos;
      94       18069 :   std::string ToString() const { return {begin, end}; }
      95             : };
      96             : 
      97             : class ParseResultIterator {
      98             :  public:
      99             :   explicit ParseResultIterator(std::vector<ParseResult> results,
     100             :                                MatchedInput matched_input)
     101      179870 :       : results_(std::move(results)), matched_input_(matched_input) {}
     102      359740 :   ~ParseResultIterator() {
     103             :     // Check that all parse results have been used.
     104      359740 :     CHECK_EQ(results_.size(), i_);
     105      179870 :   }
     106             : 
     107      179175 :   ParseResult Next() {
     108      358350 :     CHECK_LT(i_, results_.size());
     109      358350 :     return std::move(results_[i_++]);
     110             :   }
     111             :   template <class T>
     112       89977 :   T NextAs() {
     113      212937 :     return std::move(Next().Cast<T>());
     114             :   }
     115       90464 :   bool HasNext() const { return i_ < results_.size(); }
     116             : 
     117             :   const MatchedInput& matched_input() const { return matched_input_; }
     118             : 
     119             :  private:
     120             :   std::vector<ParseResult> results_;
     121             :   size_t i_ = 0;
     122             :   MatchedInput matched_input_;
     123             : 
     124             :   DISALLOW_COPY_AND_ASSIGN(ParseResultIterator);
     125             : };
     126             : 
     127          50 : struct LexerResult {
     128             :   std::vector<Symbol*> token_symbols;
     129             :   std::vector<MatchedInput> token_contents;
     130             : };
     131             : 
     132             : using Action =
     133             :     base::Optional<ParseResult> (*)(ParseResultIterator* child_results);
     134             : 
     135       89221 : inline base::Optional<ParseResult> DefaultAction(
     136             :     ParseResultIterator* child_results) {
     137       89221 :   if (!child_results->HasNext()) return base::nullopt;
     138      178396 :   return child_results->Next();
     139             : }
     140             : 
     141             : // A rule of the context-free grammar. Each rule can have an action attached to
     142             : // it, which is executed after the parsing is finished.
     143        8106 : class Rule final {
     144             :  public:
     145             :   explicit Rule(std::vector<Symbol*> right_hand_side,
     146             :                 Action action = DefaultAction)
     147       14671 :       : right_hand_side_(std::move(right_hand_side)), action_(action) {}
     148             : 
     149             :   Symbol* left() const {
     150             :     DCHECK_NOT_NULL(left_hand_side_);
     151             :     return left_hand_side_;
     152             :   }
     153             :   const std::vector<Symbol*>& right() const { return right_hand_side_; }
     154             : 
     155             :   void SetLeftHandSide(Symbol* left_hand_side) {
     156             :     DCHECK_NULL(left_hand_side_);
     157        8106 :     left_hand_side_ = left_hand_side;
     158             :   }
     159             : 
     160             :   V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
     161             :       const Item* completed_item, const LexerResult& tokens) const;
     162             : 
     163             :  private:
     164             :   Symbol* left_hand_side_ = nullptr;
     165             :   std::vector<Symbol*> right_hand_side_;
     166             :   Action action_;
     167             : };
     168             : 
     169             : // A Symbol represents a terminal or a non-terminal of the grammar.
     170             : // It stores the list of rules, which have this symbol as the
     171             : // left-hand side.
     172             : // Terminals have an empty list of rules, they are created by the Lexer
     173             : // instead of from rules.
     174             : // Symbols need to reside at stable memory addresses, because the addresses are
     175             : // used in the parser.
     176        5161 : class Symbol {
     177             :  public:
     178        4524 :   Symbol() : Symbol({}) {}
     179        6245 :   Symbol(std::initializer_list<Rule> rules) { *this = rules; }
     180             : 
     181             :   V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules);
     182             : 
     183             :   bool IsTerminal() const { return rules_.empty(); }
     184          50 :   Rule* rule(size_t index) const { return rules_[index].get(); }
     185     3530469 :   size_t rule_number() const { return rules_.size(); }
     186             : 
     187        8106 :   void AddRule(const Rule& rule) {
     188       16212 :     rules_.push_back(base::make_unique<Rule>(rule));
     189             :     rules_.back()->SetLeftHandSide(this);
     190        8106 :   }
     191             : 
     192             :   V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
     193             :       const Item* item, const LexerResult& tokens);
     194             : 
     195             :  private:
     196             :   std::vector<std::unique_ptr<Rule>> rules_;
     197             : 
     198             :   // Disallow copying and moving to ensure Symbol has a stable address.
     199             :   DISALLOW_COPY_AND_ASSIGN(Symbol);
     200             : };
     201             : 
     202             : // Items are the core datastructure of Earley's algorithm.
     203             : // They consist of a (partially) matched rule, a marked position inside of the
     204             : // right-hand side of the rule (traditionally written as a dot) and an input
     205             : // range from {start} to {pos} that matches the symbols of the right-hand side
     206             : // that are left of the mark. In addition, they store a child and a left-sibling
     207             : // pointer to reconstruct the AST in the end.
     208             : class Item {
     209             :  public:
     210             :   Item(const Rule* rule, size_t mark, size_t start, size_t pos)
     211     4527423 :       : rule_(rule), mark_(mark), start_(start), pos_(pos) {
     212             :     DCHECK_LE(mark_, right().size());
     213             :   }
     214             : 
     215             :   // A complete item has the mark at the right end, which means the input range
     216             :   // matches the complete rule.
     217     1808258 :   bool IsComplete() const {
     218             :     DCHECK_LE(mark_, right().size());
     219     3616516 :     return mark_ == right().size();
     220             :   }
     221             : 
     222             :   // The symbol right after the mark is expected at {pos} for this item to
     223             :   // advance.
     224             :   Symbol* NextSymbol() const {
     225             :     DCHECK(!IsComplete());
     226             :     DCHECK_LT(mark_, right().size());
     227     1526815 :     return right()[mark_];
     228             :   }
     229             : 
     230             :   // We successfully parsed NextSymbol() between {pos} and {new_pos}.
     231             :   // If NextSymbol() was a non-terminal, then {child} is a pointer to a
     232             :   // completed item for this parse.
     233             :   // We create a new item, which moves the mark one forward.
     234             :   Item Advance(size_t new_pos, const Item* child = nullptr) const {
     235             :     if (child) {
     236             :       DCHECK(child->IsComplete());
     237             :       DCHECK_EQ(pos(), child->start());
     238             :       DCHECK_EQ(new_pos, child->pos());
     239             :       DCHECK_EQ(NextSymbol(), child->left());
     240             :     }
     241      520065 :     Item result(rule_, mark_ + 1, start_, new_pos);
     242      520722 :     result.prev_ = this;
     243      472418 :     result.child_ = child;
     244             :     return result;
     245             :   }
     246             : 
     247             :   // Collect the items representing the AST children of this completed item.
     248             :   std::vector<const Item*> Children() const;
     249             :   // The matched input separated according to the next branching AST level.
     250             :   std::string SplitByChildren(const LexerResult& tokens) const;
     251             :   // Check if {other} results in the same AST as this Item.
     252             :   void CheckAmbiguity(const Item& other, const LexerResult& tokens) const;
     253             : 
     254             :   MatchedInput GetMatchedInput(const LexerResult& tokens) const {
     255             :     return {tokens.token_contents[start_].begin,
     256             :             start_ == pos_ ? tokens.token_contents[start_].begin
     257      152082 :                            : tokens.token_contents[pos_ - 1].end,
     258      511822 :             tokens.token_contents[start_].pos};
     259             :   }
     260             : 
     261             :   // We exclude {prev_} and {child_} from equality and hash computations,
     262             :   // because they are just globally unique data associated with an item.
     263             :   bool operator==(const Item& other) const {
     264     2148504 :     return rule_ == other.rule_ && mark_ == other.mark_ &&
     265     2148504 :            start_ == other.start_ && pos_ == other.pos_;
     266             :   }
     267             : 
     268     4527423 :   friend size_t hash_value(const Item& i) {
     269     4527423 :     return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_);
     270             :   }
     271             : 
     272             :   const Rule* rule() const { return rule_; }
     273      461288 :   Symbol* left() const { return rule_->left(); }
     274             :   const std::vector<Symbol*>& right() const { return rule_->right(); }
     275             :   size_t pos() const { return pos_; }
     276             :   size_t start() const { return start_; }
     277             : 
     278             :  private:
     279             :   const Rule* rule_;
     280             :   size_t mark_;
     281             :   size_t start_;
     282             :   size_t pos_;
     283             : 
     284             :   const Item* prev_ = nullptr;
     285             :   const Item* child_ = nullptr;
     286             : };
     287             : 
     288      179845 : inline base::Optional<ParseResult> Symbol::RunAction(
     289      179870 :     const Item* item, const LexerResult& tokens) {
     290             :   DCHECK(item->IsComplete());
     291             :   DCHECK_EQ(item->left(), this);
     292      179870 :   return item->rule()->RunAction(item, tokens);
     293             : }
     294             : 
     295             : V8_EXPORT_PRIVATE const Item* RunEarleyAlgorithm(
     296             :     Symbol* start, const LexerResult& tokens,
     297             :     std::unordered_set<Item, base::hash<Item>>* processed);
     298             : 
     299          25 : inline base::Optional<ParseResult> ParseTokens(Symbol* start,
     300             :                                                const LexerResult& tokens) {
     301          25 :   std::unordered_set<Item, base::hash<Item>> table;
     302          25 :   const Item* final_item = RunEarleyAlgorithm(start, tokens, &table);
     303          25 :   return start->RunAction(final_item, tokens);
     304             : }
     305             : 
     306             : // The lexical syntax is dynamically defined while building the grammar by
     307             : // adding patterns and keywords to the Lexer.
     308             : // The term keyword here can stand for any fixed character sequence, including
     309             : // operators and parentheses.
     310             : // Each pattern or keyword automatically gets a terminal symbol associated with
     311             : // it. These symbols form the result of the lexing.
     312             : // Patterns and keywords are matched using the longest match principle. If the
     313             : // longest matching pattern coincides with a keyword, the keyword symbol is
     314             : // chosen instead of the pattern.
     315             : // In addition, there is a single whitespace pattern which is consumed but does
     316             : // not become part of the token list.
     317          72 : class Lexer {
     318             :  public:
     319             :   // Functions to define patterns. They try to match starting from {pos}. If
     320             :   // successful, they return true and advance {pos}. Otherwise, {pos} stays
     321             :   // unchanged.
     322             :   using PatternFunction = bool (*)(InputPosition* pos);
     323             : 
     324             :   void SetWhitespace(PatternFunction whitespace) {
     325          24 :     match_whitespace_ = whitespace;
     326             :   }
     327             : 
     328         116 :   Symbol* Pattern(PatternFunction pattern) { return &patterns_[pattern]; }
     329        5042 :   Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; }
     330             :   V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input);
     331             : 
     332             :  private:
     333           0 :   PatternFunction match_whitespace_ = [](InputPosition*) { return false; };
     334             :   std::map<PatternFunction, Symbol> patterns_;
     335             :   std::map<std::string, Symbol> keywords_;
     336             :   Symbol* MatchToken(InputPosition* pos, InputPosition end);
     337             : };
     338             : 
     339             : // A grammar can have a result, which is the results of the start symbol.
     340             : // Grammar is intended to be subclassed, with Symbol members forming the
     341             : // mutually recursive rules of the grammar.
     342          24 : class Grammar {
     343             :  public:
     344             :   using PatternFunction = Lexer::PatternFunction;
     345             : 
     346          24 :   explicit Grammar(Symbol* start) : start_(start) {}
     347             : 
     348          25 :   base::Optional<ParseResult> Parse(const std::string& input) {
     349          25 :     LexerResult tokens = lexer().RunLexer(input);
     350          25 :     return ParseTokens(start_, tokens);
     351             :   }
     352             : 
     353             :  protected:
     354             :   Symbol* Token(const std::string& s) { return lexer_.Token(s); }
     355             :   Symbol* Pattern(PatternFunction pattern) { return lexer_.Pattern(pattern); }
     356             :   void SetWhitespace(PatternFunction ws) { lexer_.SetWhitespace(ws); }
     357             : 
     358             :   // NewSymbol() allocates a fresh symbol and stores it in the current grammar.
     359             :   // This is necessary to define helpers that create new symbols.
     360        2875 :   Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) {
     361        2875 :     Symbol* result = new Symbol(rules);
     362        5750 :     generated_symbols_.push_back(std::unique_ptr<Symbol>(result));
     363        2875 :     return result;
     364             :   }
     365             : 
     366             :   // Helper functions to define lexer patterns. If they match, they return true
     367             :   // and advance {pos}. Otherwise, {pos} is unchanged.
     368             :   V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int),
     369             :                                           InputPosition* pos);
     370             :   V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char),
     371             :                                           InputPosition* pos);
     372             :   V8_EXPORT_PRIVATE static bool MatchAnyChar(InputPosition* pos);
     373             :   V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos);
     374             : 
     375             :   // The action MatchInput() produces the input matched by the rule as
     376             :   // result.
     377       17866 :   static base::Optional<ParseResult> YieldMatchedInput(
     378             :       ParseResultIterator* child_results) {
     379       71464 :     return ParseResult{child_results->matched_input().ToString()};
     380             :   }
     381             : 
     382             :   // Create a new symbol to parse the given sequence of symbols.
     383             :   // At most one of the symbols can return a result.
     384         483 :   Symbol* Sequence(std::vector<Symbol*> symbols) {
     385         966 :     return NewSymbol({Rule(std::move(symbols))});
     386             :   }
     387             : 
     388             :   template <class T, T value>
     389        6521 :   static base::Optional<ParseResult> YieldIntegralConstant(
     390             :       ParseResultIterator* child_results) {
     391        6521 :     return ParseResult{value};
     392             :   }
     393             : 
     394             :   template <class T>
     395       22059 :   static base::Optional<ParseResult> YieldDefaultValue(
     396             :       ParseResultIterator* child_results) {
     397       54681 :     return ParseResult{T{}};
     398             :   }
     399             : 
     400             :   template <class From, class To>
     401       11506 :   static base::Optional<ParseResult> CastParseResult(
     402             :       ParseResultIterator* child_results) {
     403       16782 :     To result = std::move(child_results->NextAs<From>());
     404       22431 :     return ParseResult{std::move(result)};
     405             :   }
     406             : 
     407             :   // Try to parse {s} and return the result of type {Result} casted to {T}.
     408             :   // Otherwise, the result is a default-constructed {T}.
     409             :   template <class T, class Result = T>
     410        1173 :   Symbol* TryOrDefault(Symbol* s) {
     411             :     return NewSymbol({Rule({s}, CastParseResult<Result, T>),
     412        4692 :                       Rule({}, YieldDefaultValue<T>)});
     413             :   }
     414             : 
     415             :   template <class T>
     416        4385 :   static base::Optional<ParseResult> MakeSingletonVector(
     417             :       ParseResultIterator* child_results) {
     418        4385 :     T x = child_results->NextAs<T>();
     419         786 :     std::vector<T> result;
     420             :     result.push_back(std::move(x));
     421       16754 :     return ParseResult{std::move(result)};
     422             :   }
     423             : 
     424             :   template <class T>
     425        4638 :   static base::Optional<ParseResult> MakeExtendedVector(
     426             :       ParseResultIterator* child_results) {
     427        4638 :     std::vector<T> l = child_results->NextAs<std::vector<T>>();
     428        4659 :     T x = child_results->NextAs<T>();
     429             :     l.push_back(std::move(x));
     430       17908 :     return ParseResult{std::move(l)};
     431             :   }
     432             : 
     433             :   // For example, NonemptyList(Token("A"), Token(",")) parses any of
     434             :   // A or A,A or A,A,A and so on.
     435             :   template <class T>
     436         506 :   Symbol* NonemptyList(Symbol* element,
     437         506 :                        base::Optional<Symbol*> separator = {}) {
     438         506 :     Symbol* list = NewSymbol();
     439        3312 :     *list = {Rule({element}, MakeSingletonVector<T>),
     440             :              separator
     441             :                  ? Rule({list, *separator, element}, MakeExtendedVector<T>)
     442             :                  : Rule({list, element}, MakeExtendedVector<T>)};
     443         506 :     return list;
     444             :   }
     445             : 
     446             :   template <class T>
     447         414 :   Symbol* List(Symbol* element, base::Optional<Symbol*> separator = {}) {
     448         414 :     return TryOrDefault<std::vector<T>>(NonemptyList<T>(element, separator));
     449             :   }
     450             : 
     451             :   template <class T>
     452             :   Symbol* Optional(Symbol* x) {
     453         460 :     return TryOrDefault<base::Optional<T>, T>(x);
     454             :   }
     455             : 
     456         322 :   Symbol* CheckIf(Symbol* x) {
     457             :     return NewSymbol({Rule({x}, YieldIntegralConstant<bool, true>),
     458        1288 :                       Rule({}, YieldIntegralConstant<bool, false>)});
     459             :   }
     460             : 
     461             :   Lexer& lexer() { return lexer_; }
     462             : 
     463             :  private:
     464             :   Lexer lexer_;
     465             :   std::vector<std::unique_ptr<Symbol>> generated_symbols_;
     466             :   Symbol* start_;
     467             : };
     468             : 
     469             : }  // namespace torque
     470             : }  // namespace internal
     471             : }  // namespace v8
     472             : 
     473             : #endif  // V8_TORQUE_EARLEY_PARSER_H_

Generated by: LCOV version 1.10