LCOV - code coverage report
Current view: top level - src/torque - earley-parser.cc (source / functions) Hit Total Coverage
Test: app.info Lines: 114 133 85.7 %
Date: 2019-04-17 Functions: 13 14 92.9 %

          Line data    Source code
       1             : // Copyright 2018 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #include <algorithm>
       6             : #include <set>
       7             : #include <unordered_map>
       8             : #include <unordered_set>
       9             : 
      10             : #include "src/torque/ast.h"
      11             : #include "src/torque/earley-parser.h"
      12             : #include "src/torque/utils.h"
      13             : 
      14             : namespace v8 {
      15             : namespace internal {
      16             : namespace torque {
      17             : 
      18             : namespace {
      19             : 
      20             : struct LineAndColumnTracker {
      21             :   LineAndColumn previous{0, 0};
      22             :   LineAndColumn current{0, 0};
      23             : 
      24             :   void Advance(InputPosition from, InputPosition to) {
      25             :     previous = current;
      26      540357 :     while (from != to) {
      27      420707 :       if (*from == '\n') {
      28       11829 :         current.line += 1;
      29             :         current.column = 0;
      30             :       } else {
      31      408878 :         current.column += 1;
      32             :       }
      33      420707 :       ++from;
      34             :     }
      35             :   }
      36             : 
      37             :   SourcePosition ToSourcePosition() {
      38       59825 :     return {CurrentSourceFile::Get(), previous, current};
      39             :   }
      40             : };
      41             : 
      42             : }  // namespace
      43             : 
      44      252841 : base::Optional<ParseResult> Rule::RunAction(const Item* completed_item,
      45             :                                             const LexerResult& tokens) const {
      46      252841 :   std::vector<ParseResult> results;
      47      818207 :   for (const Item* child : completed_item->Children()) {
      48      372277 :     if (!child) continue;
      49             :     base::Optional<ParseResult> child_result =
      50             :         child->left()->RunAction(child, tokens);
      51      252773 :     if (child_result) results.push_back(std::move(*child_result));
      52             :   }
      53             :   MatchedInput matched_input = completed_item->GetMatchedInput(tokens);
      54      252841 :   CurrentSourcePosition::Scope pos_scope(matched_input.pos);
      55      505682 :   ParseResultIterator iterator(std::move(results), matched_input);
      56      505682 :   return action_(&iterator);
      57             : }
      58             : 
      59       16617 : Symbol& Symbol::operator=(std::initializer_list<Rule> rules) {
      60       16617 :   rules_.clear();
      61       52001 :   for (const Rule& rule : rules) {
      62       17692 :     AddRule(rule);
      63             :   }
      64       16617 :   return *this;
      65             : }
      66             : 
      67      252909 : std::vector<const Item*> Item::Children() const {
      68             :   std::vector<const Item*> children;
      69      878095 :   for (const Item* current = this; current->prev_; current = current->prev_) {
      70      312593 :     children.push_back(current->child_);
      71             :   }
      72             :   // The above loop collects the child nodes in reversed order.
      73             :   std::reverse(children.begin(), children.end());
      74             :   DCHECK_EQ(children.size(), right().size());
      75      252909 :   return children;
      76             : }
      77             : 
      78           0 : std::string Item::SplitByChildren(const LexerResult& tokens) const {
      79           0 :   if (right().size() == 1) {
      80           0 :     if (const Item* child = Children()[0])
      81           0 :       return child->SplitByChildren(tokens);
      82             :   }
      83           0 :   std::stringstream s;
      84             :   bool first = true;
      85           0 :   for (const Item* item : Children()) {
      86           0 :     if (!item) continue;
      87           0 :     if (!first) s << "  ";
      88           0 :     s << item->GetMatchedInput(tokens).ToString();
      89             :     first = false;
      90             :   }
      91             :   return s.str();
      92             : }
      93             : 
      94     1095059 : void Item::CheckAmbiguity(const Item& other, const LexerResult& tokens) const {
      95             :   DCHECK(*this == other);
      96     1095059 :   if (child_ != other.child_) {
      97           0 :     std::stringstream s;
      98             :     s << "Ambiguous grammer rules for \""
      99           0 :       << child_->GetMatchedInput(tokens).ToString() << "\":\n   "
     100           0 :       << child_->SplitByChildren(tokens) << "\nvs\n   "
     101           0 :       << other.child_->SplitByChildren(tokens);
     102           0 :     ReportError(s.str());
     103             :   }
     104     1095059 :   if (prev_ != other.prev_) {
     105           0 :     std::stringstream s;
     106           0 :     s << "Ambiguous grammer rules for \"" << GetMatchedInput(tokens).ToString()
     107           0 :       << "\":\n   " << SplitByChildren(tokens) << "  ...\nvs\n   "
     108           0 :       << other.SplitByChildren(tokens) << "  ...";
     109           0 :     ReportError(s.str());
     110             :   }
     111     1095059 : }
     112             : 
     113          70 : LexerResult Lexer::RunLexer(const std::string& input) {
     114             :   LexerResult result;
     115             :   InputPosition const begin = input.c_str();
     116          70 :   InputPosition const end = begin + input.size();
     117          70 :   InputPosition pos = begin;
     118             :   InputPosition token_start = pos;
     119             :   LineAndColumnTracker line_column_tracker;
     120             : 
     121          70 :   match_whitespace_(&pos);
     122          70 :   line_column_tracker.Advance(token_start, pos);
     123      119580 :   while (pos != end) {
     124             :     token_start = pos;
     125       59756 :     Symbol* symbol = MatchToken(&pos, end);
     126       59756 :     InputPosition token_end = pos;
     127             :     line_column_tracker.Advance(token_start, token_end);
     128       59756 :     if (!symbol) {
     129             :       CurrentSourcePosition::Scope pos_scope(
     130           2 :           line_column_tracker.ToSourcePosition());
     131           2 :       ReportError("Lexer Error: unknown token " +
     132           4 :                   StringLiteralQuote(std::string(
     133           1 :                       token_start, token_start + std::min<ptrdiff_t>(
     134           3 :                                                      end - token_start, 10))));
     135             :     }
     136       59755 :     result.token_symbols.push_back(symbol);
     137      179266 :     result.token_contents.push_back(
     138             :         {token_start, pos, line_column_tracker.ToSourcePosition()});
     139       59755 :     match_whitespace_(&pos);
     140       59755 :     line_column_tracker.Advance(token_end, pos);
     141             :   }
     142             : 
     143             :   // Add an additional token position to simplify corner cases.
     144             :   line_column_tracker.Advance(token_start, pos);
     145         139 :   result.token_contents.push_back(
     146             :       {pos, pos, line_column_tracker.ToSourcePosition()});
     147          69 :   return result;
     148             : }
     149             : 
     150       59756 : Symbol* Lexer::MatchToken(InputPosition* pos, InputPosition end) {
     151       59756 :   InputPosition token_start = *pos;
     152             :   Symbol* symbol = nullptr;
     153             :   // Find longest matching pattern.
     154      358306 :   for (std::pair<const PatternFunction, Symbol>& pair : patterns_) {
     155      298550 :     InputPosition token_end = token_start;
     156      298550 :     PatternFunction matchPattern = pair.first;
     157      298550 :     if (matchPattern(&token_end) && token_end > *pos) {
     158       29682 :       *pos = token_end;
     159       29682 :       symbol = &pair.second;
     160             :     }
     161             :   }
     162             :   // Check if matched pattern coincides with a keyword. Prefer the keyword in
     163             :   // this case.
     164       59756 :   if (*pos != token_start) {
     165       29664 :     auto found_keyword = keywords_.find(std::string(token_start, *pos));
     166       29664 :     if (found_keyword != keywords_.end()) {
     167        7538 :       return &found_keyword->second;
     168             :     }
     169             :     return symbol;
     170             :   }
     171             :   // Now check for a keyword (that doesn't overlap with a pattern).
     172             :   // Iterate from the end to ensure that if one keyword is a prefix of another,
     173             :   // we first try to match the longer one.
     174     1995013 :   for (auto it = keywords_.rbegin(); it != keywords_.rend(); ++it) {
     175     1995012 :     const std::string& keyword = it->first;
     176     1995012 :     if (static_cast<size_t>(end - *pos) < keyword.size()) continue;
     177     5983374 :     if (keyword == std::string(*pos, *pos + keyword.size())) {
     178       30091 :       *pos += keyword.size();
     179       30091 :       return &it->second;
     180             :     }
     181             :   }
     182             :   return nullptr;
     183             : }
     184             : 
// This is an implementation of Earley's parsing algorithm
// (https://en.wikipedia.org/wiki/Earley_parser).
// Returns the completed item for {start} covering the whole token stream,
// or reports a fatal parse error. {processed} owns the items; pointers
// into it stay valid because unordered_set never invalidates element
// addresses on insert.
const Item* RunEarleyAlgorithm(
    Symbol* start, const LexerResult& tokens,
    std::unordered_set<Item, base::hash<Item>>* processed) {
  // Worklist for items at the current position.
  std::vector<Item> worklist;
  // Worklist for items at the next position.
  std::vector<Item> future_items;
  CurrentSourcePosition::Scope source_position(
      SourcePosition{CurrentSourceFile::Get(), {0, 0}, {0, 0}});
  std::vector<const Item*> completed_items;
  // Maps (start position, symbol) to the items waiting for that symbol to
  // be completed from that position ('Predict' records, 'Complete' reads).
  std::unordered_map<std::pair<size_t, Symbol*>, std::set<const Item*>,
                     base::hash<std::pair<size_t, Symbol*>>>
      waiting;

  // Every newly processed item, in order — used only for error reporting.
  std::vector<const Item*> debug_trace;

  // Start with one top_level symbol mapping to the start symbol of the grammar.
  // This simplifies things because the start symbol might have several
  // rules.
  Symbol top_level;
  top_level.AddRule(Rule({start}));
  worklist.push_back(Item{top_level.rule(0), 0, 0, 0});

  size_t input_length = tokens.token_symbols.size();

  for (size_t pos = 0; pos <= input_length; ++pos) {
    while (!worklist.empty()) {
      auto insert_result = processed->insert(worklist.back());
      const Item& item = *insert_result.first;
      DCHECK_EQ(pos, item.pos());
      MatchedInput last_token = tokens.token_contents[pos];
      CurrentSourcePosition::Get() = last_token.pos;
      bool is_new = insert_result.second;
      // If an equal item was already processed, verify both derivations
      // agree — otherwise the grammar is ambiguous and an error is raised.
      if (!is_new) item.CheckAmbiguity(worklist.back(), tokens);
      worklist.pop_back();
      if (!is_new) continue;

      debug_trace.push_back(&item);
      if (item.IsComplete()) {
        // 'Complete' phase: Advance all items that were waiting to match this
        // symbol next.
        for (const Item* parent : waiting[{item.start(), item.left()}]) {
          worklist.push_back(parent->Advance(pos, &item));
        }
      } else {
        Symbol* next = item.NextSymbol();
        // 'Scan' phase: Check if {next} is the next symbol in the input (this
        // is never the case if {next} is a non-terminal).
        if (pos < tokens.token_symbols.size() &&
            tokens.token_symbols[pos] == next) {
          future_items.push_back(item.Advance(pos + 1, nullptr));
        }
        // 'Predict' phase: Add items for every rule of the non-terminal.
        if (!next->IsTerminal()) {
          // Remember that this item is waiting for completion with {next}.
          waiting[{pos, next}].insert(&item);
        }
        for (size_t i = 0; i < next->rule_number(); ++i) {
          Rule* rule = next->rule(i);
          auto already_completed =
              processed->find(Item{rule, rule->right().size(), pos, pos});
          // As discussed in section 3 of
          //    Aycock, John, and R. Nigel Horspool. "Practical earley
          //    parsing." The Computer Journal 45.6 (2002): 620-630.
          // Earley parsing has the following problem with epsilon rules:
          // When we complete an item that started at the current position
          // (that is, it matched zero tokens), we might not yet have
          // predicted all items it can complete with. Thus we check for the
          // existence of such items here and complete them immediately.
          if (already_completed != processed->end()) {
            worklist.push_back(item.Advance(pos, &*already_completed));
          } else {
            worklist.push_back(Item{rule, 0, pos, pos});
          }
        }
      }
    }
    // Move to the next input position: the items produced by the 'Scan'
    // phase become the new worklist.
    std::swap(worklist, future_items);
  }

  auto final_item =
      processed->find(Item{top_level.rule(0), 1, 0, input_length});
  if (final_item != processed->end()) {
    // Success: The {top_level} rule matches the complete input.
    return final_item->Children()[0];
  }
  // Failure: report an error based on the furthest item that was processed.
  std::string reason;
  const Item& last_item = *debug_trace.back();
  if (last_item.pos() < tokens.token_symbols.size()) {
    std::string next_token = tokens.token_contents[last_item.pos()].ToString();
    reason = "unexpected token \"" + next_token + "\"";
  } else {
    reason = "unexpected end of input";
  }
  ReportError("Parser Error: " + reason);
}
     283             : 
     284             : // static
     285      538534 : bool Grammar::MatchChar(int (*char_class)(int), InputPosition* pos) {
     286      538534 :   if (**pos && char_class(static_cast<unsigned char>(**pos))) {
     287      296989 :     ++*pos;
     288      296989 :     return true;
     289             :   }
     290             :   return false;
     291             : }
     292             : 
     293             : // static
     294       89004 : bool Grammar::MatchChar(bool (*char_class)(char), InputPosition* pos) {
     295       89007 :   if (**pos && char_class(**pos)) {
     296       86846 :     ++*pos;
     297       86843 :     return true;
     298             :   }
     299             :   return false;
     300             : }
     301             : 
     302             : // static
     303      518780 : bool Grammar::MatchString(const char* s, InputPosition* pos) {
     304      518780 :   InputPosition current = *pos;
     305      532508 :   for (; *s != 0; ++s, ++current) {
     306      520939 :     if (*s != *current) return false;
     307             :   }
     308        4705 :   *pos = current;
     309        4705 :   return true;
     310             : }
     311             : 
     312             : // static
     313           3 : bool Grammar::MatchAnyChar(InputPosition* pos) {
     314           6 :   return MatchChar([](char c) { return true; }, pos);
     315             : }
     316             : 
     317             : }  // namespace torque
     318             : }  // namespace internal
     319       59456 : }  // namespace v8

Generated by: LCOV version 1.10