Line data Source code
1 : // Copyright 2018 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include <algorithm>
6 : #include <set>
7 : #include <unordered_map>
8 : #include <unordered_set>
9 :
10 : #include "src/torque/ast.h"
11 : #include "src/torque/earley-parser.h"
12 : #include "src/torque/utils.h"
13 :
14 : namespace v8 {
15 : namespace internal {
16 : namespace torque {
17 :
18 : namespace {
19 :
20 : void UpdateSourcePosition(InputPosition from, InputPosition to,
21 : SourcePosition* pos) {
22 326466 : while (from != to) {
23 284005 : if (*from == '\n') {
24 7934 : pos->line += 1;
25 7934 : pos->column = 0;
26 : } else {
27 276071 : pos->column += 1;
28 : }
29 284005 : ++from;
30 : }
31 : }
32 :
33 : } // namespace
34 :
35 359740 : base::Optional<ParseResult> Rule::RunAction(const Item* completed_item,
36 : const LexerResult& tokens) const {
37 : std::vector<ParseResult> results;
38 941736 : for (const Item* child : completed_item->Children()) {
39 264717 : if (!child) continue;
40 : base::Optional<ParseResult> child_result =
41 179845 : child->left()->RunAction(child, tokens);
42 179845 : if (child_result) results.push_back(std::move(*child_result));
43 : }
44 : MatchedInput matched_input = completed_item->GetMatchedInput(tokens);
45 179870 : CurrentSourcePosition::Scope pos_scope(matched_input.pos);
46 359740 : ParseResultIterator iterator(std::move(results), matched_input);
47 359740 : return action_(&iterator);
48 : }
49 :
50 6866 : Symbol& Symbol::operator=(std::initializer_list<Rule> rules) {
51 : rules_.clear();
52 14050 : for (const Rule& rule : rules) {
53 7184 : AddRule(rule);
54 : }
55 6866 : return *this;
56 : }
57 :
58 179895 : std::vector<const Item*> Item::Children() const {
59 : std::vector<const Item*> children;
60 402201 : for (const Item* current = this; current->prev_; current = current->prev_) {
61 222306 : children.push_back(current->child_);
62 : }
63 : // The above loop collects the child nodes in reversed order.
64 : std::reverse(children.begin(), children.end());
65 : DCHECK_EQ(children.size(), right().size());
66 179895 : return children;
67 : }
68 :
69 0 : std::string Item::SplitByChildren(const LexerResult& tokens) const {
70 0 : if (right().size() == 1) {
71 0 : if (const Item* child = Children()[0])
72 0 : return child->SplitByChildren(tokens);
73 : }
74 0 : std::stringstream s;
75 : bool first = true;
76 0 : for (const Item* item : Children()) {
77 0 : if (!item) continue;
78 0 : if (!first) s << " ";
79 : s << item->GetMatchedInput(tokens).ToString();
80 : first = false;
81 : }
82 0 : return s.str();
83 : }
84 :
85 715486 : void Item::CheckAmbiguity(const Item& other, const LexerResult& tokens) const {
86 : DCHECK(*this == other);
87 715486 : if (child_ != other.child_) {
88 0 : std::stringstream s;
89 0 : s << "Ambiguous grammer rules for \""
90 0 : << child_->GetMatchedInput(tokens).ToString() << "\":\n "
91 0 : << child_->SplitByChildren(tokens) << "\nvs\n "
92 0 : << other.child_->SplitByChildren(tokens);
93 0 : ReportError(s.str());
94 : }
95 715486 : if (prev_ != other.prev_) {
96 0 : std::stringstream s;
97 0 : s << "Ambiguous grammer rules for \"" << GetMatchedInput(tokens).ToString()
98 0 : << "\":\n " << SplitByChildren(tokens) << " ...\nvs\n "
99 0 : << other.SplitByChildren(tokens) << " ...";
100 0 : ReportError(s.str());
101 : }
102 715486 : }
103 :
104 25 : LexerResult Lexer::RunLexer(const std::string& input) {
105 : LexerResult result;
106 : InputPosition const begin = input.c_str();
107 25 : InputPosition const end = begin + input.size();
108 25 : InputPosition pos = begin;
109 : InputPosition token_start = pos;
110 : CurrentSourcePosition::Scope scope(
111 25 : SourcePosition{CurrentSourceFile::Get(), 0, 0});
112 25 : match_whitespace_(&pos);
113 42486 : while (pos != end) {
114 42436 : UpdateSourcePosition(token_start, pos, &CurrentSourcePosition::Get());
115 42436 : token_start = pos;
116 42436 : Symbol* symbol = MatchToken(&pos, end);
117 42436 : if (!symbol) {
118 0 : ReportError("Lexer Error: unknown token " +
119 : StringLiteralQuote(std::string(
120 : token_start, token_start + std::min<ptrdiff_t>(
121 0 : end - token_start, 10))));
122 : }
123 42436 : result.token_symbols.push_back(symbol);
124 : result.token_contents.push_back(
125 127308 : {token_start, pos, CurrentSourcePosition::Get()});
126 42436 : match_whitespace_(&pos);
127 : }
128 25 : UpdateSourcePosition(token_start, pos, &CurrentSourcePosition::Get());
129 : // Add an additional token position to simplify corner cases.
130 75 : result.token_contents.push_back({pos, pos, CurrentSourcePosition::Get()});
131 25 : return result;
132 : }
133 :
134 42436 : Symbol* Lexer::MatchToken(InputPosition* pos, InputPosition end) {
135 42436 : InputPosition token_start = *pos;
136 : Symbol* symbol = nullptr;
137 : // Find longest matching pattern.
138 296924 : for (std::pair<const PatternFunction, Symbol>& pair : patterns_) {
139 212052 : InputPosition token_end = token_start;
140 212052 : PatternFunction matchPattern = pair.first;
141 212052 : if (matchPattern(&token_end) && token_end > *pos) {
142 21003 : *pos = token_end;
143 21003 : symbol = &pair.second;
144 : }
145 : }
146 : // Check if matched pattern coincides with a keyword. Prefer the keyword in
147 : // this case.
148 42436 : if (*pos != token_start) {
149 20985 : auto found_keyword = keywords_.find(std::string(token_start, *pos));
150 20985 : if (found_keyword != keywords_.end()) {
151 5171 : return &found_keyword->second;
152 : }
153 : return symbol;
154 : }
155 : // Now check for a keyword (that doesn't overlap with a pattern).
156 : // Iterate from the end to ensure that if one keyword is a prefix of another,
157 : // we first try to match the longer one.
158 1423971 : for (auto it = keywords_.rbegin(); it != keywords_.rend(); ++it) {
159 1423971 : const std::string& keyword = it->first;
160 2847942 : if (static_cast<size_t>(end - *pos) < keyword.size()) continue;
161 4270827 : if (keyword == std::string(*pos, *pos + keyword.size())) {
162 21451 : *pos += keyword.size();
163 21451 : return &it->second;
164 : }
165 : }
166 : return nullptr;
167 : }
168 :
169 : // This is an implementation of Earley's parsing algorithm
170 : // (https://en.wikipedia.org/wiki/Earley_parser).
171 25 : const Item* RunEarleyAlgorithm(
172 : Symbol* start, const LexerResult& tokens,
173 : std::unordered_set<Item, base::hash<Item>>* processed) {
174 : // Worklist for items at the current position.
175 : std::vector<Item> worklist;
176 : // Worklist for items at the next position.
177 : std::vector<Item> future_items;
178 : CurrentSourcePosition::Scope source_position(
179 25 : SourcePosition{CurrentSourceFile::Get(), 0, 0});
180 : std::vector<const Item*> completed_items;
181 : std::unordered_map<std::pair<size_t, Symbol*>, std::set<const Item*>,
182 : base::hash<std::pair<size_t, Symbol*>>>
183 25 : waiting;
184 :
185 : std::vector<const Item*> debug_trace;
186 :
187 : // Start with one top_level symbol mapping to the start symbol of the grammar.
188 : // This simplifies things because the start symbol might have several
189 : // rules.
190 25 : Symbol top_level;
191 100 : top_level.AddRule(Rule({start}));
192 25 : worklist.push_back(Item{top_level.rule(0), 0, 0, 0});
193 :
194 1526840 : size_t input_length = tokens.token_symbols.size();
195 :
196 42486 : for (size_t pos = 0; pos <= input_length; ++pos) {
197 2566205 : while (!worklist.empty()) {
198 2523744 : auto insert_result = processed->insert(worklist.back());
199 2805187 : const Item& item = *insert_result.first;
200 : DCHECK_EQ(pos, item.pos());
201 5047488 : MatchedInput last_token = tokens.token_contents[pos];
202 2523744 : CurrentSourcePosition::Get() = last_token.pos;
203 2523744 : bool is_new = insert_result.second;
204 3239230 : if (!is_new) item.CheckAmbiguity(worklist.back(), tokens);
205 : worklist.pop_back();
206 2523744 : if (!is_new) continue;
207 :
208 3616516 : debug_trace.push_back(&item);
209 1808258 : if (item.IsComplete()) {
210 : // 'Complete' phase: Advance all items that were waiting to match this
211 : // symbol next.
212 1316090 : for (const Item* parent : waiting[{item.start(), item.left()}]) {
213 471761 : worklist.push_back(parent->Advance(pos, &item));
214 : }
215 : } else {
216 : Symbol* next = item.NextSymbol();
217 : // 'Scan' phase: Check if {next} is the next symbol in the input (this
218 : // is never the case if {next} is a non-terminal).
219 3052911 : if (pos < tokens.token_symbols.size() &&
220 1526096 : tokens.token_symbols[pos] == next) {
221 96608 : future_items.push_back(item.Advance(pos + 1, nullptr));
222 : }
223 : // 'Predict' phase: Add items for every rule of the non-terminal.
224 1526815 : if (!next->IsTerminal()) {
225 : // Remember that this item is waiting for completion with {next}.
226 1639644 : waiting[{pos, next}].insert(&item);
227 : }
228 5534123 : for (size_t i = 0; i < next->rule_number(); ++i) {
229 : Rule* rule = next->rule(i);
230 : auto already_completed =
231 4007308 : processed->find(Item{rule, rule->right().size(), pos, pos});
232 : // As discussed in section 3 of
233 : // Aycock, John, and R. Nigel Horspool. "Practical earley
234 : // parsing." The Computer Journal 45.6 (2002): 620-630.
235 : // Earley parsing has the following problem with epsilon rules:
236 : // When we complete an item that started at the current position
237 : // (that is, it matched zero tokens), we might not yet have
238 : // predicted all items it can complete with. Thus we check for the
239 : // existence of such items here and complete them immediately.
240 2003654 : if (already_completed != processed->end()) {
241 1314 : worklist.push_back(item.Advance(pos, &*already_completed));
242 : } else {
243 2002997 : worklist.push_back(Item{rule, 0, pos, pos});
244 : }
245 : }
246 : }
247 : }
248 : std::swap(worklist, future_items);
249 : }
250 :
251 : auto final_item =
252 25 : processed->find(Item{top_level.rule(0), 1, 0, input_length});
253 25 : if (final_item != processed->end()) {
254 : // Success: The {top_level} rule matches the complete input.
255 75 : return final_item->Children()[0];
256 : }
257 : std::string reason;
258 0 : const Item& last_item = *debug_trace.back();
259 0 : if (last_item.pos() < tokens.token_symbols.size()) {
260 0 : std::string next_token = tokens.token_contents[last_item.pos()].ToString();
261 0 : reason = "unexpected token \"" + next_token + "\"";
262 : } else {
263 : reason = "unexpected end of input";
264 : }
265 0 : ReportError("Parser Error: " + reason);
266 : }
267 :
268 : // static
269 378722 : bool Grammar::MatchChar(int (*char_class)(int), InputPosition* pos) {
270 378722 : if (**pos && char_class(static_cast<unsigned char>(**pos))) {
271 207612 : ++*pos;
272 207612 : return true;
273 : }
274 : return false;
275 : }
276 :
277 : // static
278 52322 : bool Grammar::MatchChar(bool (*char_class)(char), InputPosition* pos) {
279 52323 : if (**pos && char_class(**pos)) {
280 51009 : ++*pos;
281 51008 : return true;
282 : }
283 : return false;
284 : }
285 :
286 : // static
287 367519 : bool Grammar::MatchString(const char* s, InputPosition* pos) {
288 367519 : InputPosition current = *pos;
289 371539 : for (; *s != 0; ++s, ++current) {
290 368857 : if (*s != *current) return false;
291 : }
292 2682 : *pos = current;
293 2682 : return true;
294 : }
295 :
296 : // static
297 1 : bool Grammar::MatchAnyChar(InputPosition* pos) {
298 2 : return MatchChar([](char c) { return true; }, pos);
299 : }
300 :
301 : } // namespace torque
302 : } // namespace internal
303 9078 : } // namespace v8
|