Line data Source code
1 : // Copyright 2018 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include <algorithm>
6 : #include <set>
7 : #include <unordered_map>
8 : #include <unordered_set>
9 :
10 : #include "src/torque/ast.h"
11 : #include "src/torque/earley-parser.h"
12 : #include "src/torque/utils.h"
13 :
14 : namespace v8 {
15 : namespace internal {
16 : namespace torque {
17 :
18 : namespace {
19 :
20 : struct LineAndColumnTracker {
21 : LineAndColumn previous{0, 0};
22 : LineAndColumn current{0, 0};
23 :
24 : void Advance(InputPosition from, InputPosition to) {
25 : previous = current;
26 540357 : while (from != to) {
27 420707 : if (*from == '\n') {
28 11829 : current.line += 1;
29 : current.column = 0;
30 : } else {
31 408878 : current.column += 1;
32 : }
33 420707 : ++from;
34 : }
35 : }
36 :
37 : SourcePosition ToSourcePosition() {
38 59825 : return {CurrentSourceFile::Get(), previous, current};
39 : }
40 : };
41 :
42 : } // namespace
43 :
44 252841 : base::Optional<ParseResult> Rule::RunAction(const Item* completed_item,
45 : const LexerResult& tokens) const {
46 252841 : std::vector<ParseResult> results;
47 818207 : for (const Item* child : completed_item->Children()) {
48 372277 : if (!child) continue;
49 : base::Optional<ParseResult> child_result =
50 : child->left()->RunAction(child, tokens);
51 252773 : if (child_result) results.push_back(std::move(*child_result));
52 : }
53 : MatchedInput matched_input = completed_item->GetMatchedInput(tokens);
54 252841 : CurrentSourcePosition::Scope pos_scope(matched_input.pos);
55 505682 : ParseResultIterator iterator(std::move(results), matched_input);
56 505682 : return action_(&iterator);
57 : }
58 :
59 16617 : Symbol& Symbol::operator=(std::initializer_list<Rule> rules) {
60 16617 : rules_.clear();
61 52001 : for (const Rule& rule : rules) {
62 17692 : AddRule(rule);
63 : }
64 16617 : return *this;
65 : }
66 :
67 252909 : std::vector<const Item*> Item::Children() const {
68 : std::vector<const Item*> children;
69 878095 : for (const Item* current = this; current->prev_; current = current->prev_) {
70 312593 : children.push_back(current->child_);
71 : }
72 : // The above loop collects the child nodes in reversed order.
73 : std::reverse(children.begin(), children.end());
74 : DCHECK_EQ(children.size(), right().size());
75 252909 : return children;
76 : }
77 :
78 0 : std::string Item::SplitByChildren(const LexerResult& tokens) const {
79 0 : if (right().size() == 1) {
80 0 : if (const Item* child = Children()[0])
81 0 : return child->SplitByChildren(tokens);
82 : }
83 0 : std::stringstream s;
84 : bool first = true;
85 0 : for (const Item* item : Children()) {
86 0 : if (!item) continue;
87 0 : if (!first) s << " ";
88 0 : s << item->GetMatchedInput(tokens).ToString();
89 : first = false;
90 : }
91 : return s.str();
92 : }
93 :
94 1095059 : void Item::CheckAmbiguity(const Item& other, const LexerResult& tokens) const {
95 : DCHECK(*this == other);
96 1095059 : if (child_ != other.child_) {
97 0 : std::stringstream s;
98 : s << "Ambiguous grammer rules for \""
99 0 : << child_->GetMatchedInput(tokens).ToString() << "\":\n "
100 0 : << child_->SplitByChildren(tokens) << "\nvs\n "
101 0 : << other.child_->SplitByChildren(tokens);
102 0 : ReportError(s.str());
103 : }
104 1095059 : if (prev_ != other.prev_) {
105 0 : std::stringstream s;
106 0 : s << "Ambiguous grammer rules for \"" << GetMatchedInput(tokens).ToString()
107 0 : << "\":\n " << SplitByChildren(tokens) << " ...\nvs\n "
108 0 : << other.SplitByChildren(tokens) << " ...";
109 0 : ReportError(s.str());
110 : }
111 1095059 : }
112 :
// Splits {input} into a sequence of tokens. For every token, both the
// matched Symbol and the matched characters (with their source position)
// are recorded in the result.
LexerResult Lexer::RunLexer(const std::string& input) {
  LexerResult result;
  InputPosition const begin = input.c_str();
  InputPosition const end = begin + input.size();
  InputPosition pos = begin;
  InputPosition token_start = pos;
  LineAndColumnTracker line_column_tracker;

  // Skip anything match_whitespace_ accepts before the first token.
  match_whitespace_(&pos);
  line_column_tracker.Advance(token_start, pos);
  while (pos != end) {
    token_start = pos;
    Symbol* symbol = MatchToken(&pos, end);
    InputPosition token_end = pos;
    line_column_tracker.Advance(token_start, token_end);
    if (!symbol) {
      // Attribute the error to the position of the unmatchable input.
      CurrentSourcePosition::Scope pos_scope(
          line_column_tracker.ToSourcePosition());
      // Quote at most 10 characters of the offending input.
      // NOTE(review): this assumes ReportError does not return; otherwise a
      // nullptr symbol would be pushed below — confirm ReportError aborts.
      ReportError("Lexer Error: unknown token " +
                  StringLiteralQuote(std::string(
                      token_start, token_start + std::min<ptrdiff_t>(
                                                     end - token_start, 10))));
    }
    result.token_symbols.push_back(symbol);
    result.token_contents.push_back(
        {token_start, pos, line_column_tracker.ToSourcePosition()});
    // Skip whitespace following the token before the next iteration.
    match_whitespace_(&pos);
    line_column_tracker.Advance(token_end, pos);
  }

  // Add an additional token position to simplify corner cases.
  line_column_tracker.Advance(token_start, pos);
  result.token_contents.push_back(
      {pos, pos, line_column_tracker.ToSourcePosition()});
  return result;
}
149 :
150 59756 : Symbol* Lexer::MatchToken(InputPosition* pos, InputPosition end) {
151 59756 : InputPosition token_start = *pos;
152 : Symbol* symbol = nullptr;
153 : // Find longest matching pattern.
154 358306 : for (std::pair<const PatternFunction, Symbol>& pair : patterns_) {
155 298550 : InputPosition token_end = token_start;
156 298550 : PatternFunction matchPattern = pair.first;
157 298550 : if (matchPattern(&token_end) && token_end > *pos) {
158 29682 : *pos = token_end;
159 29682 : symbol = &pair.second;
160 : }
161 : }
162 : // Check if matched pattern coincides with a keyword. Prefer the keyword in
163 : // this case.
164 59756 : if (*pos != token_start) {
165 29664 : auto found_keyword = keywords_.find(std::string(token_start, *pos));
166 29664 : if (found_keyword != keywords_.end()) {
167 7538 : return &found_keyword->second;
168 : }
169 : return symbol;
170 : }
171 : // Now check for a keyword (that doesn't overlap with a pattern).
172 : // Iterate from the end to ensure that if one keyword is a prefix of another,
173 : // we first try to match the longer one.
174 1995013 : for (auto it = keywords_.rbegin(); it != keywords_.rend(); ++it) {
175 1995012 : const std::string& keyword = it->first;
176 1995012 : if (static_cast<size_t>(end - *pos) < keyword.size()) continue;
177 5983374 : if (keyword == std::string(*pos, *pos + keyword.size())) {
178 30091 : *pos += keyword.size();
179 30091 : return &it->second;
180 : }
181 : }
182 : return nullptr;
183 : }
184 :
185 : // This is an implementation of Earley's parsing algorithm
186 : // (https://en.wikipedia.org/wiki/Earley_parser).
187 69 : const Item* RunEarleyAlgorithm(
188 : Symbol* start, const LexerResult& tokens,
189 : std::unordered_set<Item, base::hash<Item>>* processed) {
190 : // Worklist for items at the current position.
191 : std::vector<Item> worklist;
192 : // Worklist for items at the next position.
193 : std::vector<Item> future_items;
194 : CurrentSourcePosition::Scope source_position(
195 70 : SourcePosition{CurrentSourceFile::Get(), {0, 0}, {0, 0}});
196 : std::vector<const Item*> completed_items;
197 : std::unordered_map<std::pair<size_t, Symbol*>, std::set<const Item*>,
198 : base::hash<std::pair<size_t, Symbol*>>>
199 : waiting;
200 :
201 : std::vector<const Item*> debug_trace;
202 :
203 : // Start with one top_level symbol mapping to the start symbol of the grammar.
204 : // This simplifies things because the start symbol might have several
205 : // rules.
206 : Symbol top_level;
207 276 : top_level.AddRule(Rule({start}));
208 70 : worklist.push_back(Item{top_level.rule(0), 0, 0, 0});
209 :
210 : size_t input_length = tokens.token_symbols.size();
211 :
212 119715 : for (size_t pos = 0; pos <= input_length; ++pos) {
213 3902793 : while (!worklist.empty()) {
214 : auto insert_result = processed->insert(worklist.back());
215 : const Item& item = *insert_result.first;
216 : DCHECK_EQ(pos, item.pos());
217 3842970 : MatchedInput last_token = tokens.token_contents[pos];
218 3842970 : CurrentSourcePosition::Get() = last_token.pos;
219 3842970 : bool is_new = insert_result.second;
220 4938029 : if (!is_new) item.CheckAmbiguity(worklist.back(), tokens);
221 : worklist.pop_back();
222 3842970 : if (!is_new) continue;
223 :
224 5495822 : debug_trace.push_back(&item);
225 2747911 : if (item.IsComplete()) {
226 : // 'Complete' phase: Advance all items that were waiting to match this
227 : // symbol next.
228 1544664 : for (const Item* parent : waiting[{item.start(), item.left()}]) {
229 694578 : worklist.push_back(parent->Advance(pos, &item));
230 : }
231 : } else {
232 : Symbol* next = item.NextSymbol();
233 : // 'Scan' phase: Check if {next} is the next symbol in the input (this
234 : // is never the case if {next} is a non-terminal).
235 4643840 : if (pos < tokens.token_symbols.size() &&
236 2320972 : tokens.token_symbols[pos] == next) {
237 136338 : future_items.push_back(item.Advance(pos + 1, nullptr));
238 : }
239 : // 'Predict' phase: Add items for every rule of the non-terminal.
240 2322868 : if (!next->IsTerminal()) {
241 : // Remember that this item is waiting for completion with {next}.
242 2327796 : waiting[{pos, next}].insert(&item);
243 : }
244 8483176 : for (size_t i = 0; i < next->rule_number(); ++i) {
245 : Rule* rule = next->rule(i);
246 : auto already_completed =
247 3080154 : processed->find(Item{rule, rule->right().size(), pos, pos});
248 : // As discussed in section 3 of
249 : // Aycock, John, and R. Nigel Horspool. "Practical earley
250 : // parsing." The Computer Journal 45.6 (2002): 620-630.
251 : // Earley parsing has the following problem with epsilon rules:
252 : // When we complete an item that started at the current position
253 : // (that is, it matched zero tokens), we might not yet have
254 : // predicted all items it can complete with. Thus we check for the
255 : // existence of such items here and complete them immediately.
256 3080154 : if (already_completed != processed->end()) {
257 2062 : worklist.push_back(item.Advance(pos, &*already_completed));
258 : } else {
259 3078092 : worklist.push_back(Item{rule, 0, pos, pos});
260 : }
261 : }
262 : }
263 : }
264 : std::swap(worklist, future_items);
265 : }
266 :
267 : auto final_item =
268 70 : processed->find(Item{top_level.rule(0), 1, 0, input_length});
269 69 : if (final_item != processed->end()) {
270 : // Success: The {top_level} rule matches the complete input.
271 204 : return final_item->Children()[0];
272 : }
273 : std::string reason;
274 1 : const Item& last_item = *debug_trace.back();
275 1 : if (last_item.pos() < tokens.token_symbols.size()) {
276 : std::string next_token = tokens.token_contents[last_item.pos()].ToString();
277 3 : reason = "unexpected token \"" + next_token + "\"";
278 : } else {
279 : reason = "unexpected end of input";
280 : }
281 2 : ReportError("Parser Error: " + reason);
282 : }
283 :
284 : // static
285 538534 : bool Grammar::MatchChar(int (*char_class)(int), InputPosition* pos) {
286 538534 : if (**pos && char_class(static_cast<unsigned char>(**pos))) {
287 296989 : ++*pos;
288 296989 : return true;
289 : }
290 : return false;
291 : }
292 :
293 : // static
294 89004 : bool Grammar::MatchChar(bool (*char_class)(char), InputPosition* pos) {
295 89007 : if (**pos && char_class(**pos)) {
296 86846 : ++*pos;
297 86843 : return true;
298 : }
299 : return false;
300 : }
301 :
302 : // static
303 518780 : bool Grammar::MatchString(const char* s, InputPosition* pos) {
304 518780 : InputPosition current = *pos;
305 532508 : for (; *s != 0; ++s, ++current) {
306 520939 : if (*s != *current) return false;
307 : }
308 4705 : *pos = current;
309 4705 : return true;
310 : }
311 :
312 : // static
313 3 : bool Grammar::MatchAnyChar(InputPosition* pos) {
314 6 : return MatchChar([](char c) { return true; }, pos);
315 : }
316 :
317 : } // namespace torque
318 : } // namespace internal
319 59456 : } // namespace v8
|