Line data Source code
1 : // Copyright 2018 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_TORQUE_EARLEY_PARSER_H_
6 : #define V8_TORQUE_EARLEY_PARSER_H_
7 :
8 : #include <map>
9 : #include <vector>
10 :
11 : #include "src/base/optional.h"
12 : #include "src/torque/contextual.h"
13 : #include "src/torque/source-positions.h"
14 : #include "src/torque/utils.h"
15 :
16 : namespace v8 {
17 : namespace internal {
18 : namespace torque {
19 :
20 : class Symbol;
21 : class Item;
22 :
23 : class ParseResultHolderBase {
24 : public:
25 : enum class TypeId;
26 89979 : virtual ~ParseResultHolderBase() = default;
27 : template <class T>
28 : T& Cast();
29 : template <class T>
30 : const T& Cast() const;
31 :
32 : protected:
33 89979 : explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) {
34 : // MSVC wrongly complains about type_id_ being an unused private field.
35 : USE(type_id_);
36 : }
37 :
38 : private:
39 : const TypeId type_id_;
40 : };
41 :
42 : using ParseResultTypeId = ParseResultHolderBase::TypeId;
43 :
44 : template <class T>
45 179958 : class ParseResultHolder : public ParseResultHolderBase {
46 : public:
47 3 : explicit ParseResultHolder(T value)
48 90006 : : ParseResultHolderBase(id), value_(std::move(value)) {}
49 :
50 : private:
51 : V8_EXPORT_PRIVATE static const TypeId id;
52 : friend class ParseResultHolderBase;
53 : T value_;
54 : };
55 :
56 : template <class T>
57 89979 : T& ParseResultHolderBase::Cast() {
58 89979 : CHECK_EQ(ParseResultHolder<T>::id, type_id_);
59 89979 : return static_cast<ParseResultHolder<T>*>(this)->value_;
60 : }
61 :
62 : template <class T>
63 : const T& ParseResultHolderBase::Cast() const {
64 : CHECK_EQ(ParseResultHolder<T>::id, type_id_);
65 : return static_cast<const ParseResultHolder<T>*>(this)->value_;
66 : }
67 :
68 : class ParseResult {
69 : public:
70 : template <class T>
71 204884 : explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {}
72 :
73 : template <class T>
74 : const T& Cast() const {
75 : return value_->Cast<T>();
76 : }
77 : template <class T>
78 : T& Cast() {
79 89979 : return value_->Cast<T>();
80 : }
81 :
82 : private:
83 : std::unique_ptr<ParseResultHolderBase> value_;
84 : };
85 :
86 : using InputPosition = const char*;
87 :
88 : struct MatchedInput {
89 : MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos)
90 222331 : : begin(begin), end(end), pos(pos) {}
91 : InputPosition begin;
92 : InputPosition end;
93 : SourcePosition pos;
94 18069 : std::string ToString() const { return {begin, end}; }
95 : };
96 :
97 : class ParseResultIterator {
98 : public:
99 : explicit ParseResultIterator(std::vector<ParseResult> results,
100 : MatchedInput matched_input)
101 179870 : : results_(std::move(results)), matched_input_(matched_input) {}
102 359740 : ~ParseResultIterator() {
103 : // Check that all parse results have been used.
104 359740 : CHECK_EQ(results_.size(), i_);
105 179870 : }
106 :
107 179175 : ParseResult Next() {
108 358350 : CHECK_LT(i_, results_.size());
109 358350 : return std::move(results_[i_++]);
110 : }
111 : template <class T>
112 89977 : T NextAs() {
113 212937 : return std::move(Next().Cast<T>());
114 : }
115 90464 : bool HasNext() const { return i_ < results_.size(); }
116 :
117 : const MatchedInput& matched_input() const { return matched_input_; }
118 :
119 : private:
120 : std::vector<ParseResult> results_;
121 : size_t i_ = 0;
122 : MatchedInput matched_input_;
123 :
124 : DISALLOW_COPY_AND_ASSIGN(ParseResultIterator);
125 : };
126 :
127 50 : struct LexerResult {
128 : std::vector<Symbol*> token_symbols;
129 : std::vector<MatchedInput> token_contents;
130 : };
131 :
132 : using Action =
133 : base::Optional<ParseResult> (*)(ParseResultIterator* child_results);
134 :
135 89221 : inline base::Optional<ParseResult> DefaultAction(
136 : ParseResultIterator* child_results) {
137 89221 : if (!child_results->HasNext()) return base::nullopt;
138 178396 : return child_results->Next();
139 : }
140 :
141 : // A rule of the context-free grammar. Each rule can have an action attached to
142 : // it, which is executed after the parsing is finished.
143 8106 : class Rule final {
144 : public:
145 : explicit Rule(std::vector<Symbol*> right_hand_side,
146 : Action action = DefaultAction)
147 14671 : : right_hand_side_(std::move(right_hand_side)), action_(action) {}
148 :
149 : Symbol* left() const {
150 : DCHECK_NOT_NULL(left_hand_side_);
151 : return left_hand_side_;
152 : }
153 : const std::vector<Symbol*>& right() const { return right_hand_side_; }
154 :
155 : void SetLeftHandSide(Symbol* left_hand_side) {
156 : DCHECK_NULL(left_hand_side_);
157 8106 : left_hand_side_ = left_hand_side;
158 : }
159 :
160 : V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
161 : const Item* completed_item, const LexerResult& tokens) const;
162 :
163 : private:
164 : Symbol* left_hand_side_ = nullptr;
165 : std::vector<Symbol*> right_hand_side_;
166 : Action action_;
167 : };
168 :
169 : // A Symbol represents a terminal or a non-terminal of the grammar.
170 : // It stores the list of rules, which have this symbol as the
171 : // left-hand side.
172 : // Terminals have an empty list of rules, they are created by the Lexer
173 : // instead of from rules.
174 : // Symbols need to reside at stable memory addresses, because the addresses are
175 : // used in the parser.
176 5161 : class Symbol {
177 : public:
178 4524 : Symbol() : Symbol({}) {}
179 6245 : Symbol(std::initializer_list<Rule> rules) { *this = rules; }
180 :
181 : V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules);
182 :
183 : bool IsTerminal() const { return rules_.empty(); }
184 50 : Rule* rule(size_t index) const { return rules_[index].get(); }
185 3530469 : size_t rule_number() const { return rules_.size(); }
186 :
187 8106 : void AddRule(const Rule& rule) {
188 16212 : rules_.push_back(base::make_unique<Rule>(rule));
189 : rules_.back()->SetLeftHandSide(this);
190 8106 : }
191 :
192 : V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
193 : const Item* item, const LexerResult& tokens);
194 :
195 : private:
196 : std::vector<std::unique_ptr<Rule>> rules_;
197 :
198 : // Disallow copying and moving to ensure Symbol has a stable address.
199 : DISALLOW_COPY_AND_ASSIGN(Symbol);
200 : };
201 :
202 : // Items are the core datastructure of Earley's algorithm.
203 : // They consist of a (partially) matched rule, a marked position inside of the
204 : // right-hand side of the rule (traditionally written as a dot) and an input
205 : // range from {start} to {pos} that matches the symbols of the right-hand side
206 : // that are left of the mark. In addition, they store a child and a left-sibling
207 : // pointer to reconstruct the AST in the end.
208 : class Item {
209 : public:
210 : Item(const Rule* rule, size_t mark, size_t start, size_t pos)
211 4527423 : : rule_(rule), mark_(mark), start_(start), pos_(pos) {
212 : DCHECK_LE(mark_, right().size());
213 : }
214 :
215 : // A complete item has the mark at the right end, which means the input range
216 : // matches the complete rule.
217 1808258 : bool IsComplete() const {
218 : DCHECK_LE(mark_, right().size());
219 3616516 : return mark_ == right().size();
220 : }
221 :
222 : // The symbol right after the mark is expected at {pos} for this item to
223 : // advance.
224 : Symbol* NextSymbol() const {
225 : DCHECK(!IsComplete());
226 : DCHECK_LT(mark_, right().size());
227 1526815 : return right()[mark_];
228 : }
229 :
230 : // We successfully parsed NextSymbol() between {pos} and {new_pos}.
231 : // If NextSymbol() was a non-terminal, then {child} is a pointer to a
232 : // completed item for this parse.
233 : // We create a new item, which moves the mark one forward.
234 : Item Advance(size_t new_pos, const Item* child = nullptr) const {
235 : if (child) {
236 : DCHECK(child->IsComplete());
237 : DCHECK_EQ(pos(), child->start());
238 : DCHECK_EQ(new_pos, child->pos());
239 : DCHECK_EQ(NextSymbol(), child->left());
240 : }
241 520065 : Item result(rule_, mark_ + 1, start_, new_pos);
242 520722 : result.prev_ = this;
243 472418 : result.child_ = child;
244 : return result;
245 : }
246 :
247 : // Collect the items representing the AST children of this completed item.
248 : std::vector<const Item*> Children() const;
249 : // The matched input separated according to the next branching AST level.
250 : std::string SplitByChildren(const LexerResult& tokens) const;
251 : // Check if {other} results in the same AST as this Item.
252 : void CheckAmbiguity(const Item& other, const LexerResult& tokens) const;
253 :
254 : MatchedInput GetMatchedInput(const LexerResult& tokens) const {
255 : return {tokens.token_contents[start_].begin,
256 : start_ == pos_ ? tokens.token_contents[start_].begin
257 152082 : : tokens.token_contents[pos_ - 1].end,
258 511822 : tokens.token_contents[start_].pos};
259 : }
260 :
261 : // We exclude {prev_} and {child_} from equality and hash computations,
262 : // because they are just globally unique data associated with an item.
263 : bool operator==(const Item& other) const {
264 2148504 : return rule_ == other.rule_ && mark_ == other.mark_ &&
265 2148504 : start_ == other.start_ && pos_ == other.pos_;
266 : }
267 :
268 4527423 : friend size_t hash_value(const Item& i) {
269 4527423 : return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_);
270 : }
271 :
272 : const Rule* rule() const { return rule_; }
273 461288 : Symbol* left() const { return rule_->left(); }
274 : const std::vector<Symbol*>& right() const { return rule_->right(); }
275 : size_t pos() const { return pos_; }
276 : size_t start() const { return start_; }
277 :
278 : private:
279 : const Rule* rule_;
280 : size_t mark_;
281 : size_t start_;
282 : size_t pos_;
283 :
284 : const Item* prev_ = nullptr;
285 : const Item* child_ = nullptr;
286 : };
287 :
288 179845 : inline base::Optional<ParseResult> Symbol::RunAction(
289 179870 : const Item* item, const LexerResult& tokens) {
290 : DCHECK(item->IsComplete());
291 : DCHECK_EQ(item->left(), this);
292 179870 : return item->rule()->RunAction(item, tokens);
293 : }
294 :
295 : V8_EXPORT_PRIVATE const Item* RunEarleyAlgorithm(
296 : Symbol* start, const LexerResult& tokens,
297 : std::unordered_set<Item, base::hash<Item>>* processed);
298 :
299 25 : inline base::Optional<ParseResult> ParseTokens(Symbol* start,
300 : const LexerResult& tokens) {
301 25 : std::unordered_set<Item, base::hash<Item>> table;
302 25 : const Item* final_item = RunEarleyAlgorithm(start, tokens, &table);
303 25 : return start->RunAction(final_item, tokens);
304 : }
305 :
306 : // The lexical syntax is dynamically defined while building the grammar by
307 : // adding patterns and keywords to the Lexer.
308 : // The term keyword here can stand for any fixed character sequence, including
309 : // operators and parentheses.
310 : // Each pattern or keyword automatically gets a terminal symbol associated with
311 : // it. These symbols form the result of the lexing.
312 : // Patterns and keywords are matched using the longest match principle. If the
313 : // longest matching pattern coincides with a keyword, the keyword symbol is
314 : // chosen instead of the pattern.
315 : // In addition, there is a single whitespace pattern which is consumed but does
316 : // not become part of the token list.
317 72 : class Lexer {
318 : public:
319 : // Functions to define patterns. They try to match starting from {pos}. If
320 : // successful, they return true and advance {pos}. Otherwise, {pos} stays
321 : // unchanged.
322 : using PatternFunction = bool (*)(InputPosition* pos);
323 :
324 : void SetWhitespace(PatternFunction whitespace) {
325 24 : match_whitespace_ = whitespace;
326 : }
327 :
328 116 : Symbol* Pattern(PatternFunction pattern) { return &patterns_[pattern]; }
329 5042 : Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; }
330 : V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input);
331 :
332 : private:
333 0 : PatternFunction match_whitespace_ = [](InputPosition*) { return false; };
334 : std::map<PatternFunction, Symbol> patterns_;
335 : std::map<std::string, Symbol> keywords_;
336 : Symbol* MatchToken(InputPosition* pos, InputPosition end);
337 : };
338 :
339 : // A grammar can have a result, which is the results of the start symbol.
340 : // Grammar is intended to be subclassed, with Symbol members forming the
341 : // mutually recursive rules of the grammar.
342 24 : class Grammar {
343 : public:
344 : using PatternFunction = Lexer::PatternFunction;
345 :
346 24 : explicit Grammar(Symbol* start) : start_(start) {}
347 :
348 25 : base::Optional<ParseResult> Parse(const std::string& input) {
349 25 : LexerResult tokens = lexer().RunLexer(input);
350 25 : return ParseTokens(start_, tokens);
351 : }
352 :
353 : protected:
354 : Symbol* Token(const std::string& s) { return lexer_.Token(s); }
355 : Symbol* Pattern(PatternFunction pattern) { return lexer_.Pattern(pattern); }
356 : void SetWhitespace(PatternFunction ws) { lexer_.SetWhitespace(ws); }
357 :
358 : // NewSymbol() allocates a fresh symbol and stores it in the current grammar.
359 : // This is necessary to define helpers that create new symbols.
360 2875 : Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) {
361 2875 : Symbol* result = new Symbol(rules);
362 5750 : generated_symbols_.push_back(std::unique_ptr<Symbol>(result));
363 2875 : return result;
364 : }
365 :
366 : // Helper functions to define lexer patterns. If they match, they return true
367 : // and advance {pos}. Otherwise, {pos} is unchanged.
368 : V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int),
369 : InputPosition* pos);
370 : V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char),
371 : InputPosition* pos);
372 : V8_EXPORT_PRIVATE static bool MatchAnyChar(InputPosition* pos);
373 : V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos);
374 :
375 : // The action MatchInput() produces the input matched by the rule as
376 : // result.
377 17866 : static base::Optional<ParseResult> YieldMatchedInput(
378 : ParseResultIterator* child_results) {
379 71464 : return ParseResult{child_results->matched_input().ToString()};
380 : }
381 :
382 : // Create a new symbol to parse the given sequence of symbols.
383 : // At most one of the symbols can return a result.
384 483 : Symbol* Sequence(std::vector<Symbol*> symbols) {
385 966 : return NewSymbol({Rule(std::move(symbols))});
386 : }
387 :
388 : template <class T, T value>
389 6521 : static base::Optional<ParseResult> YieldIntegralConstant(
390 : ParseResultIterator* child_results) {
391 6521 : return ParseResult{value};
392 : }
393 :
394 : template <class T>
395 22059 : static base::Optional<ParseResult> YieldDefaultValue(
396 : ParseResultIterator* child_results) {
397 54681 : return ParseResult{T{}};
398 : }
399 :
400 : template <class From, class To>
401 11506 : static base::Optional<ParseResult> CastParseResult(
402 : ParseResultIterator* child_results) {
403 16782 : To result = std::move(child_results->NextAs<From>());
404 22431 : return ParseResult{std::move(result)};
405 : }
406 :
407 : // Try to parse {s} and return the result of type {Result} casted to {T}.
408 : // Otherwise, the result is a default-constructed {T}.
409 : template <class T, class Result = T>
410 1173 : Symbol* TryOrDefault(Symbol* s) {
411 : return NewSymbol({Rule({s}, CastParseResult<Result, T>),
412 4692 : Rule({}, YieldDefaultValue<T>)});
413 : }
414 :
415 : template <class T>
416 4385 : static base::Optional<ParseResult> MakeSingletonVector(
417 : ParseResultIterator* child_results) {
418 4385 : T x = child_results->NextAs<T>();
419 786 : std::vector<T> result;
420 : result.push_back(std::move(x));
421 16754 : return ParseResult{std::move(result)};
422 : }
423 :
424 : template <class T>
425 4638 : static base::Optional<ParseResult> MakeExtendedVector(
426 : ParseResultIterator* child_results) {
427 4638 : std::vector<T> l = child_results->NextAs<std::vector<T>>();
428 4659 : T x = child_results->NextAs<T>();
429 : l.push_back(std::move(x));
430 17908 : return ParseResult{std::move(l)};
431 : }
432 :
433 : // For example, NonemptyList(Token("A"), Token(",")) parses any of
434 : // A or A,A or A,A,A and so on.
435 : template <class T>
436 506 : Symbol* NonemptyList(Symbol* element,
437 506 : base::Optional<Symbol*> separator = {}) {
438 506 : Symbol* list = NewSymbol();
439 3312 : *list = {Rule({element}, MakeSingletonVector<T>),
440 : separator
441 : ? Rule({list, *separator, element}, MakeExtendedVector<T>)
442 : : Rule({list, element}, MakeExtendedVector<T>)};
443 506 : return list;
444 : }
445 :
446 : template <class T>
447 414 : Symbol* List(Symbol* element, base::Optional<Symbol*> separator = {}) {
448 414 : return TryOrDefault<std::vector<T>>(NonemptyList<T>(element, separator));
449 : }
450 :
451 : template <class T>
452 : Symbol* Optional(Symbol* x) {
453 460 : return TryOrDefault<base::Optional<T>, T>(x);
454 : }
455 :
456 322 : Symbol* CheckIf(Symbol* x) {
457 : return NewSymbol({Rule({x}, YieldIntegralConstant<bool, true>),
458 1288 : Rule({}, YieldIntegralConstant<bool, false>)});
459 : }
460 :
461 : Lexer& lexer() { return lexer_; }
462 :
463 : private:
464 : Lexer lexer_;
465 : std::vector<std::unique_ptr<Symbol>> generated_symbols_;
466 : Symbol* start_;
467 : };
468 :
469 : } // namespace torque
470 : } // namespace internal
471 : } // namespace v8
472 :
473 : #endif // V8_TORQUE_EARLEY_PARSER_H_
|