LCOV - app.info - src/torque/earley-parser.cc

LCOV - code coverage report

Current view:	top level - src/torque - earley-parser.cc (source / functions)		Hit	Total	Coverage
Test:	app.info	Lines:	110	137	80.3 %
Date:	2019-01-20	Functions:	14	15	93.3 %

          Line data    Source code

       1             : // Copyright 2018 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #include <algorithm>
       6             : #include <set>
       7             : #include <unordered_map>
       8             : #include <unordered_set>
       9             : 
      10             : #include "src/torque/ast.h"
      11             : #include "src/torque/earley-parser.h"
      12             : #include "src/torque/utils.h"
      13             : 
      14             : namespace v8 {
      15             : namespace internal {
      16             : namespace torque {
      17             : 
      18             : namespace {
      19             : 
      20             : void UpdateSourcePosition(InputPosition from, InputPosition to,
      21             :                           SourcePosition* pos) {
      22      326466 :   while (from != to) {
      23      284005 :     if (*from == '\n') {
      24        7934 :       pos->line += 1;
      25        7934 :       pos->column = 0;
      26             :     } else {
      27      276071 :       pos->column += 1;
      28             :     }
      29      284005 :     ++from;
      30             :   }
      31             : }
      32             : 
      33             : }  // namespace
      34             : 
      35      359740 : base::Optional<ParseResult> Rule::RunAction(const Item* completed_item,
      36             :                                             const LexerResult& tokens) const {
      37             :   std::vector<ParseResult> results;
      38      941736 :   for (const Item* child : completed_item->Children()) {
      39      264717 :     if (!child) continue;
      40             :     base::Optional<ParseResult> child_result =
      41      179845 :         child->left()->RunAction(child, tokens);
      42      179845 :     if (child_result) results.push_back(std::move(*child_result));
      43             :   }
      44             :   MatchedInput matched_input = completed_item->GetMatchedInput(tokens);
      45      179870 :   CurrentSourcePosition::Scope pos_scope(matched_input.pos);
      46      359740 :   ParseResultIterator iterator(std::move(results), matched_input);
      47      359740 :   return action_(&iterator);
      48             : }
      49             : 
      50        6866 : Symbol& Symbol::operator=(std::initializer_list<Rule> rules) {
      51             :   rules_.clear();
      52       14050 :   for (const Rule& rule : rules) {
      53        7184 :     AddRule(rule);
      54             :   }
      55        6866 :   return *this;
      56             : }
      57             : 
      58      179895 : std::vector<const Item*> Item::Children() const {
      59             :   std::vector<const Item*> children;
      60      402201 :   for (const Item* current = this; current->prev_; current = current->prev_) {
      61      222306 :     children.push_back(current->child_);
      62             :   }
      63             :   // The above loop collects the child nodes in reversed order.
      64             :   std::reverse(children.begin(), children.end());
      65             :   DCHECK_EQ(children.size(), right().size());
      66      179895 :   return children;
      67             : }
      68             : 
      69           0 : std::string Item::SplitByChildren(const LexerResult& tokens) const {
      70           0 :   if (right().size() == 1) {
      71           0 :     if (const Item* child = Children()[0])
      72           0 :       return child->SplitByChildren(tokens);
      73             :   }
      74           0 :   std::stringstream s;
      75             :   bool first = true;
      76           0 :   for (const Item* item : Children()) {
      77           0 :     if (!item) continue;
      78           0 :     if (!first) s << "  ";
      79             :     s << item->GetMatchedInput(tokens).ToString();
      80             :     first = false;
      81             :   }
      82           0 :   return s.str();
      83             : }
      84             : 
      85      715486 : void Item::CheckAmbiguity(const Item& other, const LexerResult& tokens) const {
      86             :   DCHECK(*this == other);
      87      715486 :   if (child_ != other.child_) {
      88           0 :     std::stringstream s;
      89           0 :     s << "Ambiguous grammer rules for \""
      90           0 :       << child_->GetMatchedInput(tokens).ToString() << "\":\n   "
      91           0 :       << child_->SplitByChildren(tokens) << "\nvs\n   "
      92           0 :       << other.child_->SplitByChildren(tokens);
      93           0 :     ReportError(s.str());
      94             :   }
      95      715486 :   if (prev_ != other.prev_) {
      96           0 :     std::stringstream s;
      97           0 :     s << "Ambiguous grammer rules for \"" << GetMatchedInput(tokens).ToString()
      98           0 :       << "\":\n   " << SplitByChildren(tokens) << "  ...\nvs\n   "
      99           0 :       << other.SplitByChildren(tokens) << "  ...";
     100           0 :     ReportError(s.str());
     101             :   }
     102      715486 : }
     103             : 
     104          25 : LexerResult Lexer::RunLexer(const std::string& input) {
     105             :   LexerResult result;
     106             :   InputPosition const begin = input.c_str();
     107          25 :   InputPosition const end = begin + input.size();
     108          25 :   InputPosition pos = begin;
     109             :   InputPosition token_start = pos;
     110             :   CurrentSourcePosition::Scope scope(
     111          25 :       SourcePosition{CurrentSourceFile::Get(), 0, 0});
     112          25 :   match_whitespace_(&pos);
     113       42486 :   while (pos != end) {
     114       42436 :     UpdateSourcePosition(token_start, pos, &CurrentSourcePosition::Get());
     115       42436 :     token_start = pos;
     116       42436 :     Symbol* symbol = MatchToken(&pos, end);
     117       42436 :     if (!symbol) {
     118           0 :       ReportError("Lexer Error: unknown token " +
     119             :                   StringLiteralQuote(std::string(
     120             :                       token_start, token_start + std::min<ptrdiff_t>(
     121           0 :                                                      end - token_start, 10))));
     122             :     }
     123       42436 :     result.token_symbols.push_back(symbol);
     124             :     result.token_contents.push_back(
     125      127308 :         {token_start, pos, CurrentSourcePosition::Get()});
     126       42436 :     match_whitespace_(&pos);
     127             :   }
     128          25 :   UpdateSourcePosition(token_start, pos, &CurrentSourcePosition::Get());
     129             :   // Add an additional token position to simplify corner cases.
     130          75 :   result.token_contents.push_back({pos, pos, CurrentSourcePosition::Get()});
     131          25 :   return result;
     132             : }
     133             : 
     134       42436 : Symbol* Lexer::MatchToken(InputPosition* pos, InputPosition end) {
     135       42436 :   InputPosition token_start = *pos;
     136             :   Symbol* symbol = nullptr;
     137             :   // Find longest matching pattern.
     138      296924 :   for (std::pair<const PatternFunction, Symbol>& pair : patterns_) {
     139      212052 :     InputPosition token_end = token_start;
     140      212052 :     PatternFunction matchPattern = pair.first;
     141      212052 :     if (matchPattern(&token_end) && token_end > *pos) {
     142       21003 :       *pos = token_end;
     143       21003 :       symbol = &pair.second;
     144             :     }
     145             :   }
     146             :   // Check if matched pattern coincides with a keyword. Prefer the keyword in
     147             :   // this case.
     148       42436 :   if (*pos != token_start) {
     149       20985 :     auto found_keyword = keywords_.find(std::string(token_start, *pos));
     150       20985 :     if (found_keyword != keywords_.end()) {
     151        5171 :       return &found_keyword->second;
     152             :     }
     153             :     return symbol;
     154             :   }
     155             :   // Now check for a keyword (that doesn't overlap with a pattern).
     156             :   // Iterate from the end to ensure that if one keyword is a prefix of another,
     157             :   // we first try to match the longer one.
     158     1423971 :   for (auto it = keywords_.rbegin(); it != keywords_.rend(); ++it) {
     159     1423971 :     const std::string& keyword = it->first;
     160     2847942 :     if (static_cast<size_t>(end - *pos) < keyword.size()) continue;
     161     4270827 :     if (keyword == std::string(*pos, *pos + keyword.size())) {
     162       21451 :       *pos += keyword.size();
     163       21451 :       return &it->second;
     164             :     }
     165             :   }
     166             :   return nullptr;
     167             : }
     168             : 
     169             : // This is an implementation of Earley's parsing algorithm
     170             : // (https://en.wikipedia.org/wiki/Earley_parser).
     171          25 : const Item* RunEarleyAlgorithm(
     172             :     Symbol* start, const LexerResult& tokens,
     173             :     std::unordered_set<Item, base::hash<Item>>* processed) {
     174             :   // Worklist for items at the current position.
     175             :   std::vector<Item> worklist;
     176             :   // Worklist for items at the next position.
     177             :   std::vector<Item> future_items;
     178             :   CurrentSourcePosition::Scope source_position(
     179          25 :       SourcePosition{CurrentSourceFile::Get(), 0, 0});
     180             :   std::vector<const Item*> completed_items;
     181             :   std::unordered_map<std::pair<size_t, Symbol*>, std::set<const Item*>,
     182             :                      base::hash<std::pair<size_t, Symbol*>>>
     183          25 :       waiting;
     184             : 
     185             :   std::vector<const Item*> debug_trace;
     186             : 
     187             :   // Start with one top_level symbol mapping to the start symbol of the grammar.
     188             :   // This simplifies things because the start symbol might have several
     189             :   // rules.
     190          25 :   Symbol top_level;
     191         100 :   top_level.AddRule(Rule({start}));
     192          25 :   worklist.push_back(Item{top_level.rule(0), 0, 0, 0});
     193             : 
     194     1526840 :   size_t input_length = tokens.token_symbols.size();
     195             : 
     196       42486 :   for (size_t pos = 0; pos <= input_length; ++pos) {
     197     2566205 :     while (!worklist.empty()) {
     198     2523744 :       auto insert_result = processed->insert(worklist.back());
     199     2805187 :       const Item& item = *insert_result.first;
     200             :       DCHECK_EQ(pos, item.pos());
     201     5047488 :       MatchedInput last_token = tokens.token_contents[pos];
     202     2523744 :       CurrentSourcePosition::Get() = last_token.pos;
     203     2523744 :       bool is_new = insert_result.second;
     204     3239230 :       if (!is_new) item.CheckAmbiguity(worklist.back(), tokens);
     205             :       worklist.pop_back();
     206     2523744 :       if (!is_new) continue;
     207             : 
     208     3616516 :       debug_trace.push_back(&item);
     209     1808258 :       if (item.IsComplete()) {
     210             :         // 'Complete' phase: Advance all items that were waiting to match this
     211             :         // symbol next.
     212     1316090 :         for (const Item* parent : waiting[{item.start(), item.left()}]) {
     213      471761 :           worklist.push_back(parent->Advance(pos, &item));
     214             :         }
     215             :       } else {
     216             :         Symbol* next = item.NextSymbol();
     217             :         // 'Scan' phase: Check if {next} is the next symbol in the input (this
     218             :         // is never the case if {next} is a non-terminal).
     219     3052911 :         if (pos < tokens.token_symbols.size() &&
     220     1526096 :             tokens.token_symbols[pos] == next) {
     221       96608 :           future_items.push_back(item.Advance(pos + 1, nullptr));
     222             :         }
     223             :         // 'Predict' phase: Add items for every rule of the non-terminal.
     224     1526815 :         if (!next->IsTerminal()) {
     225             :           // Remember that this item is waiting for completion with {next}.
     226     1639644 :           waiting[{pos, next}].insert(&item);
     227             :         }
     228     5534123 :         for (size_t i = 0; i < next->rule_number(); ++i) {
     229             :           Rule* rule = next->rule(i);
     230             :           auto already_completed =
     231     4007308 :               processed->find(Item{rule, rule->right().size(), pos, pos});
     232             :           // As discussed in section 3 of
     233             :           //    Aycock, John, and R. Nigel Horspool. "Practical earley
     234             :           //    parsing." The Computer Journal 45.6 (2002): 620-630.
     235             :           // Earley parsing has the following problem with epsilon rules:
     236             :           // When we complete an item that started at the current position
     237             :           // (that is, it matched zero tokens), we might not yet have
     238             :           // predicted all items it can complete with. Thus we check for the
     239             :           // existence of such items here and complete them immediately.
     240     2003654 :           if (already_completed != processed->end()) {
     241        1314 :             worklist.push_back(item.Advance(pos, &*already_completed));
     242             :           } else {
     243     2002997 :             worklist.push_back(Item{rule, 0, pos, pos});
     244             :           }
     245             :         }
     246             :       }
     247             :     }
     248             :     std::swap(worklist, future_items);
     249             :   }
     250             : 
     251             :   auto final_item =
     252          25 :       processed->find(Item{top_level.rule(0), 1, 0, input_length});
     253          25 :   if (final_item != processed->end()) {
     254             :     // Success: The {top_level} rule matches the complete input.
     255          75 :     return final_item->Children()[0];
     256             :   }
     257             :   std::string reason;
     258           0 :   const Item& last_item = *debug_trace.back();
     259           0 :   if (last_item.pos() < tokens.token_symbols.size()) {
     260           0 :     std::string next_token = tokens.token_contents[last_item.pos()].ToString();
     261           0 :     reason = "unexpected token \"" + next_token + "\"";
     262             :   } else {
     263             :     reason = "unexpected end of input";
     264             :   }
     265           0 :   ReportError("Parser Error: " + reason);
     266             : }
     267             : 
     268             : // static
     269      378722 : bool Grammar::MatchChar(int (*char_class)(int), InputPosition* pos) {
     270      378722 :   if (**pos && char_class(static_cast<unsigned char>(**pos))) {
     271      207612 :     ++*pos;
     272      207612 :     return true;
     273             :   }
     274             :   return false;
     275             : }
     276             : 
     277             : // static
     278       52322 : bool Grammar::MatchChar(bool (*char_class)(char), InputPosition* pos) {
     279       52323 :   if (**pos && char_class(**pos)) {
     280       51009 :     ++*pos;
     281       51008 :     return true;
     282             :   }
     283             :   return false;
     284             : }
     285             : 
     286             : // static
     287      367519 : bool Grammar::MatchString(const char* s, InputPosition* pos) {
     288      367519 :   InputPosition current = *pos;
     289      371539 :   for (; *s != 0; ++s, ++current) {
     290      368857 :     if (*s != *current) return false;
     291             :   }
     292        2682 :   *pos = current;
     293        2682 :   return true;
     294             : }
     295             : 
     296             : // static
     297           1 : bool Grammar::MatchAnyChar(InputPosition* pos) {
     298           2 :   return MatchChar([](char c) { return true; }, pos);
     299             : }
     300             : 
     301             : }  // namespace torque
     302             : }  // namespace internal
     303        9078 : }  // namespace v8

Generated by: LCOV version 1.10