Line data Source code
1 : // Copyright 2018 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include <algorithm>
6 : #include <set>
7 : #include <unordered_map>
8 : #include <unordered_set>
9 :
10 : #include "src/torque/ast.h"
11 : #include "src/torque/earley-parser.h"
12 : #include "src/torque/utils.h"
13 :
14 : namespace v8 {
15 : namespace internal {
16 : namespace torque {
17 :
18 : namespace {
19 :
20 : struct LineAndColumnTracker {
21 : LineAndColumn previous{0, 0};
22 : LineAndColumn current{0, 0};
23 :
24 : void Advance(InputPosition from, InputPosition to) {
25 : previous = current;
26 617095 : while (from != to) {
27 522255 : if (*from == '\n') {
28 8881 : current.line += 1;
29 : current.column = 0;
30 : } else {
31 513374 : current.column += 1;
32 : }
33 522255 : ++from;
34 : }
35 : }
36 :
37 : SourcePosition ToSourcePosition() {
38 47435 : return {CurrentSourceFile::Get(), previous, current};
39 : }
40 : };
41 :
42 : } // namespace
43 :
44 390472 : base::Optional<ParseResult> Rule::RunAction(const Item* completed_item,
45 : const LexerResult& tokens) const {
46 : std::vector<ParseResult> results;
47 1023525 : for (const Item* child : completed_item->Children()) {
48 290016 : if (!child) continue;
49 : base::Optional<ParseResult> child_result =
50 195206 : child->left()->RunAction(child, tokens);
51 195206 : if (child_result) results.push_back(std::move(*child_result));
52 : }
53 : MatchedInput matched_input = completed_item->GetMatchedInput(tokens);
54 195236 : CurrentSourcePosition::Scope pos_scope(matched_input.pos);
55 390472 : ParseResultIterator iterator(std::move(results), matched_input);
56 390472 : return action_(&iterator);
57 : }
58 :
59 8496 : Symbol& Symbol::operator=(std::initializer_list<Rule> rules) {
60 : rules_.clear();
61 17464 : for (const Rule& rule : rules) {
62 8968 : AddRule(rule);
63 : }
64 8496 : return *this;
65 : }
66 :
67 195266 : std::vector<const Item*> Item::Children() const {
68 : std::vector<const Item*> children;
69 437907 : for (const Item* current = this; current->prev_; current = current->prev_) {
70 242641 : children.push_back(current->child_);
71 : }
72 : // The above loop collects the child nodes in reversed order.
73 : std::reverse(children.begin(), children.end());
74 : DCHECK_EQ(children.size(), right().size());
75 195266 : return children;
76 : }
77 :
78 0 : std::string Item::SplitByChildren(const LexerResult& tokens) const {
79 0 : if (right().size() == 1) {
80 0 : if (const Item* child = Children()[0])
81 0 : return child->SplitByChildren(tokens);
82 : }
83 0 : std::stringstream s;
84 : bool first = true;
85 0 : for (const Item* item : Children()) {
86 0 : if (!item) continue;
87 0 : if (!first) s << " ";
88 : s << item->GetMatchedInput(tokens).ToString();
89 : first = false;
90 : }
91 0 : return s.str();
92 : }
93 :
94 774612 : void Item::CheckAmbiguity(const Item& other, const LexerResult& tokens) const {
95 : DCHECK(*this == other);
96 774612 : if (child_ != other.child_) {
97 0 : std::stringstream s;
98 0 : s << "Ambiguous grammer rules for \""
99 0 : << child_->GetMatchedInput(tokens).ToString() << "\":\n "
100 0 : << child_->SplitByChildren(tokens) << "\nvs\n "
101 0 : << other.child_->SplitByChildren(tokens);
102 0 : ReportError(s.str());
103 : }
104 774612 : if (prev_ != other.prev_) {
105 0 : std::stringstream s;
106 0 : s << "Ambiguous grammer rules for \"" << GetMatchedInput(tokens).ToString()
107 0 : << "\":\n " << SplitByChildren(tokens) << " ...\nvs\n "
108 0 : << other.SplitByChildren(tokens) << " ...";
109 0 : ReportError(s.str());
110 : }
111 774612 : }
112 :
113 30 : LexerResult Lexer::RunLexer(const std::string& input) {
114 : LexerResult result;
115 : InputPosition const begin = input.c_str();
116 30 : InputPosition const end = begin + input.size();
117 30 : InputPosition pos = begin;
118 : InputPosition token_start = pos;
119 : LineAndColumnTracker line_column_tracker;
120 :
121 30 : match_whitespace_(&pos);
122 47465 : while (pos != end) {
123 : line_column_tracker.Advance(token_start, pos);
124 : token_start = pos;
125 47405 : Symbol* symbol = MatchToken(&pos, end);
126 47405 : if (!symbol) {
127 0 : ReportError("Lexer Error: unknown token " +
128 : StringLiteralQuote(std::string(
129 : token_start, token_start + std::min<ptrdiff_t>(
130 0 : end - token_start, 10))));
131 : }
132 47405 : line_column_tracker.Advance(token_start, pos);
133 47405 : result.token_symbols.push_back(symbol);
134 : result.token_contents.push_back(
135 142215 : {token_start, pos, line_column_tracker.ToSourcePosition()});
136 47405 : match_whitespace_(&pos);
137 : }
138 :
139 : // Add an additional token position to simplify corner cases.
140 : line_column_tracker.Advance(token_start, pos);
141 : result.token_contents.push_back(
142 90 : {pos, pos, line_column_tracker.ToSourcePosition()});
143 30 : return result;
144 : }
145 :
146 47405 : Symbol* Lexer::MatchToken(InputPosition* pos, InputPosition end) {
147 47405 : InputPosition token_start = *pos;
148 : Symbol* symbol = nullptr;
149 : // Find longest matching pattern.
150 331707 : for (std::pair<const PatternFunction, Symbol>& pair : patterns_) {
151 236897 : InputPosition token_end = token_start;
152 236897 : PatternFunction matchPattern = pair.first;
153 236897 : if (matchPattern(&token_end) && token_end > *pos) {
154 23412 : *pos = token_end;
155 23412 : symbol = &pair.second;
156 : }
157 : }
158 : // Check if matched pattern coincides with a keyword. Prefer the keyword in
159 : // this case.
160 47405 : if (*pos != token_start) {
161 23394 : auto found_keyword = keywords_.find(std::string(token_start, *pos));
162 23394 : if (found_keyword != keywords_.end()) {
163 5808 : return &found_keyword->second;
164 : }
165 : return symbol;
166 : }
167 : // Now check for a keyword (that doesn't overlap with a pattern).
168 : // Iterate from the end to ensure that if one keyword is a prefix of another,
169 : // we first try to match the longer one.
170 1588925 : for (auto it = keywords_.rbegin(); it != keywords_.rend(); ++it) {
171 1588925 : const std::string& keyword = it->first;
172 3177850 : if (static_cast<size_t>(end - *pos) < keyword.size()) continue;
173 4765701 : if (keyword == std::string(*pos, *pos + keyword.size())) {
174 24011 : *pos += keyword.size();
175 24011 : return &it->second;
176 : }
177 : }
178 : return nullptr;
179 : }
180 :
// This is an implementation of Earley's parsing algorithm
// (https://en.wikipedia.org/wiki/Earley_parser).
//
// Parses {tokens} with {start} as the grammar's start symbol. All items ever
// created are interned in {processed}, which owns them; the returned pointer
// (the root item for {start}) points into that set. On a parse failure this
// calls ReportError with a diagnostic; ReportError is expected not to return
// (otherwise control would run off the end of this value-returning function).
const Item* RunEarleyAlgorithm(
    Symbol* start, const LexerResult& tokens,
    std::unordered_set<Item, base::hash<Item>>* processed) {
  // Worklist for items at the current position.
  std::vector<Item> worklist;
  // Worklist for items at the next position.
  std::vector<Item> future_items;
  CurrentSourcePosition::Scope source_position(
      SourcePosition{CurrentSourceFile::Get(), {0, 0}, {0, 0}});
  std::vector<const Item*> completed_items;
  // Maps (start position, symbol) to the items waiting for that symbol to be
  // completed at that position. Pointers into {processed} are stable since
  // std::unordered_set never moves its elements.
  std::unordered_map<std::pair<size_t, Symbol*>, std::set<const Item*>,
                     base::hash<std::pair<size_t, Symbol*>>>
      waiting;

  // All items in processing order; used only to report the furthest progress
  // point when the parse fails.
  std::vector<const Item*> debug_trace;

  // Start with one top_level symbol mapping to the start symbol of the grammar.
  // This simplifies things because the start symbol might have several
  // rules.
  Symbol top_level;
  top_level.AddRule(Rule({start}));
  worklist.push_back(Item{top_level.rule(0), 0, 0, 0});

  size_t input_length = tokens.token_symbols.size();

  // One iteration per input position, plus one extra for position
  // {input_length} so completions triggered by the final token still run.
  for (size_t pos = 0; pos <= input_length; ++pos) {
    while (!worklist.empty()) {
      // Intern the item. If an equal item already exists, check that both
      // derivations agree (otherwise the grammar is ambiguous) and skip it.
      auto insert_result = processed->insert(worklist.back());
      const Item& item = *insert_result.first;
      DCHECK_EQ(pos, item.pos());
      MatchedInput last_token = tokens.token_contents[pos];
      CurrentSourcePosition::Get() = last_token.pos;
      bool is_new = insert_result.second;
      if (!is_new) item.CheckAmbiguity(worklist.back(), tokens);
      worklist.pop_back();
      if (!is_new) continue;

      debug_trace.push_back(&item);
      if (item.IsComplete()) {
        // 'Complete' phase: Advance all items that were waiting to match this
        // symbol next.
        for (const Item* parent : waiting[{item.start(), item.left()}]) {
          worklist.push_back(parent->Advance(pos, &item));
        }
      } else {
        Symbol* next = item.NextSymbol();
        // 'Scan' phase: Check if {next} is the next symbol in the input (this
        // is never the case if {next} is a non-terminal).
        if (pos < tokens.token_symbols.size() &&
            tokens.token_symbols[pos] == next) {
          future_items.push_back(item.Advance(pos + 1, nullptr));
        }
        // 'Predict' phase: Add items for every rule of the non-terminal.
        if (!next->IsTerminal()) {
          // Remember that this item is waiting for completion with {next}.
          waiting[{pos, next}].insert(&item);
        }
        for (size_t i = 0; i < next->rule_number(); ++i) {
          Rule* rule = next->rule(i);
          auto already_completed =
              processed->find(Item{rule, rule->right().size(), pos, pos});
          // As discussed in section 3 of
          //   Aycock, John, and R. Nigel Horspool. "Practical earley
          //   parsing." The Computer Journal 45.6 (2002): 620-630.
          // Earley parsing has the following problem with epsilon rules:
          // When we complete an item that started at the current position
          // (that is, it matched zero tokens), we might not yet have
          // predicted all items it can complete with. Thus we check for the
          // existence of such items here and complete them immediately.
          if (already_completed != processed->end()) {
            worklist.push_back(item.Advance(pos, &*already_completed));
          } else {
            worklist.push_back(Item{rule, 0, pos, pos});
          }
        }
      }
    }
    // Items produced by the 'Scan' phase belong to the next position; make
    // them the worklist for the next loop iteration.
    std::swap(worklist, future_items);
  }

  // The parse succeeded iff the synthetic top-level rule (which has exactly
  // one right-hand-side symbol, hence dot position 1) matched [0, input_length].
  auto final_item =
      processed->find(Item{top_level.rule(0), 1, 0, input_length});
  if (final_item != processed->end()) {
    // Success: The {top_level} rule matches the complete input.
    return final_item->Children()[0];
  }
  // Failure: report the token at the last position the parser reached, or
  // end-of-input if it consumed everything but could not complete.
  std::string reason;
  const Item& last_item = *debug_trace.back();
  if (last_item.pos() < tokens.token_symbols.size()) {
    std::string next_token = tokens.token_contents[last_item.pos()].ToString();
    reason = "unexpected token \"" + next_token + "\"";
  } else {
    reason = "unexpected end of input";
  }
  ReportError("Parser Error: " + reason);
}
279 :
280 : // static
281 421244 : bool Grammar::MatchChar(int (*char_class)(int), InputPosition* pos) {
282 421244 : if (**pos && char_class(static_cast<unsigned char>(**pos))) {
283 229896 : ++*pos;
284 229896 : return true;
285 : }
286 : return false;
287 : }
288 :
289 : // static
290 63869 : bool Grammar::MatchChar(bool (*char_class)(char), InputPosition* pos) {
291 63870 : if (**pos && char_class(**pos)) {
292 62294 : ++*pos;
293 62293 : return true;
294 : }
295 : return false;
296 : }
297 :
298 : // static
299 410863 : bool Grammar::MatchString(const char* s, InputPosition* pos) {
300 410863 : InputPosition current = *pos;
301 415936 : for (; *s != 0; ++s, ++current) {
302 412462 : if (*s != *current) return false;
303 : }
304 3474 : *pos = current;
305 3474 : return true;
306 : }
307 :
308 : // static
309 1 : bool Grammar::MatchAnyChar(InputPosition* pos) {
310 2 : return MatchChar([](char c) { return true; }, pos);
311 : }
312 :
313 : } // namespace torque
314 : } // namespace internal
315 9114 : } // namespace v8
|