Line data Source code
1 : // Copyright 2018 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_TORQUE_EARLEY_PARSER_H_
6 : #define V8_TORQUE_EARLEY_PARSER_H_
7 :
8 : #include <map>
9 : #include <vector>
10 :
11 : #include "src/base/optional.h"
12 : #include "src/torque/contextual.h"
13 : #include "src/torque/source-positions.h"
14 : #include "src/torque/utils.h"
15 :
16 : namespace v8 {
17 : namespace internal {
18 : namespace torque {
19 :
20 : class Symbol;
21 : class Item;
22 :
23 : class ParseResultHolderBase {
24 : public:
25 : enum class TypeId;
26 130706 : virtual ~ParseResultHolderBase() = default;
27 : template <class T>
28 : T& Cast();
29 : template <class T>
30 : const T& Cast() const;
31 :
32 : protected:
33 130706 : explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) {
34 : // MSVC wrongly complains about type_id_ being an unused private field.
35 : USE(type_id_);
36 : }
37 :
38 : private:
39 : const TypeId type_id_;
40 : };
41 :
42 : enum class ParseResultHolderBase::TypeId {
43 : kStdString,
44 : kBool,
45 : kStdVectorOfString,
46 : kExpressionPtr,
47 : kIdentifierPtr,
48 : kOptionalIdentifierPtr,
49 : kStatementPtr,
50 : kDeclarationPtr,
51 : kTypeExpressionPtr,
52 : kOptionalTypeExpressionPtr,
53 : kLabelBlockPtr,
54 : kOptionalLabelBlockPtr,
55 : kNameAndTypeExpression,
56 : kNameAndExpression,
57 : kClassFieldExpression,
58 : kStructFieldExpression,
59 : kStdVectorOfNameAndTypeExpression,
60 : kStdVectorOfNameAndExpression,
61 : kStdVectorOfClassFieldExpression,
62 : kStdVectorOfStructFieldExpression,
63 : kIncrementDecrementOperator,
64 : kOptionalStdString,
65 : kStdVectorOfStatementPtr,
66 : kStdVectorOfDeclarationPtr,
67 : kStdVectorOfExpressionPtr,
68 : kExpressionWithSource,
69 : kParameterList,
70 : kRangeExpression,
71 : kOptionalRangeExpression,
72 : kTypeList,
73 : kOptionalTypeList,
74 : kLabelAndTypes,
75 : kStdVectorOfLabelAndTypes,
76 : kStdVectorOfLabelBlockPtr,
77 : kOptionalStatementPtr,
78 : kOptionalExpressionPtr,
79 : kTypeswitchCase,
80 : kStdVectorOfTypeswitchCase,
81 : kStdVectorOfIdentifierPtr,
82 :
83 : kJsonValue,
84 : kJsonMember,
85 : kStdVectorOfJsonValue,
86 : kStdVectorOfJsonMember,
87 : };
88 :
89 : using ParseResultTypeId = ParseResultHolderBase::TypeId;
90 :
91 : template <class T>
92 261412 : class ParseResultHolder : public ParseResultHolderBase {
93 : public:
94 171 : explicit ParseResultHolder(T value)
95 130937 : : ParseResultHolderBase(id), value_(std::move(value)) {}
96 :
97 : private:
98 : V8_EXPORT_PRIVATE static const TypeId id;
99 : friend class ParseResultHolderBase;
100 : T value_;
101 : };
102 :
103 : template <class T>
104 : T& ParseResultHolderBase::Cast() {
105 130706 : CHECK_EQ(ParseResultHolder<T>::id, type_id_);
106 1618 : return static_cast<ParseResultHolder<T>*>(this)->value_;
107 : }
108 :
109 : template <class T>
110 : const T& ParseResultHolderBase::Cast() const {
111 : CHECK_EQ(ParseResultHolder<T>::id, type_id_);
112 : return static_cast<const ParseResultHolder<T>*>(this)->value_;
113 : }
114 :
115 917713 : class ParseResult {
116 : public:
117 : template <class T>
118 283877 : explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {}
119 :
120 : template <class T>
121 : const T& Cast() const& {
122 : return value_->Cast<T>();
123 : }
124 : template <class T>
125 : T& Cast() & {
126 : return value_->Cast<T>();
127 : }
128 : template <class T>
129 : T&& Cast() && {
130 : return std::move(value_->Cast<T>());
131 : }
132 :
133 : private:
134 : std::unique_ptr<ParseResultHolderBase> value_;
135 : };
136 :
137 : using InputPosition = const char*;
138 :
139 : struct MatchedInput {
140 : MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos)
141 312665 : : begin(begin), end(end), pos(pos) {}
142 : InputPosition begin;
143 : InputPosition end;
144 : SourcePosition pos;
145 : std::string ToString() const { return {begin, end}; }
146 : };
147 :
148 : class ParseResultIterator {
149 : public:
150 : explicit ParseResultIterator(std::vector<ParseResult> results,
151 : MatchedInput matched_input)
152 252841 : : results_(std::move(results)), matched_input_(matched_input) {}
153 505682 : ~ParseResultIterator() {
154 : // Check that all parse results have been used.
155 505682 : CHECK_EQ(results_.size(), i_);
156 252841 : }
157 :
158 251803 : ParseResult Next() {
159 251803 : CHECK_LT(i_, results_.size());
160 503606 : return std::move(results_[i_++]);
161 : }
162 : template <class T>
163 130692 : T NextAs() {
164 318002 : return std::move(Next().Cast<T>());
165 : }
166 122951 : bool HasNext() const { return i_ < results_.size(); }
167 :
168 : const MatchedInput& matched_input() const { return matched_input_; }
169 :
170 : private:
171 : std::vector<ParseResult> results_;
172 : size_t i_ = 0;
173 : MatchedInput matched_input_;
174 :
175 : DISALLOW_COPY_AND_ASSIGN(ParseResultIterator);
176 : };
177 :
178 140 : struct LexerResult {
179 : std::vector<Symbol*> token_symbols;
180 : std::vector<MatchedInput> token_contents;
181 : };
182 :
183 : using Action =
184 : base::Optional<ParseResult> (*)(ParseResultIterator* child_results);
185 :
186 121169 : inline base::Optional<ParseResult> DefaultAction(
187 : ParseResultIterator* child_results) {
188 121169 : if (!child_results->HasNext()) return base::nullopt;
189 242222 : return child_results->Next();
190 : }
191 :
192 : // A rule of the context-free grammar. Each rule can have an action attached to
193 : // it, which is executed after the parsing is finished.
194 59763 : class Rule final {
195 : public:
196 : explicit Rule(std::vector<Symbol*> right_hand_side,
197 : Action action = DefaultAction)
198 35980 : : right_hand_side_(std::move(right_hand_side)), action_(action) {}
199 :
200 : Symbol* left() const {
201 : DCHECK_NOT_NULL(left_hand_side_);
202 : return left_hand_side_;
203 : }
204 : const std::vector<Symbol*>& right() const { return right_hand_side_; }
205 :
206 : void SetLeftHandSide(Symbol* left_hand_side) {
207 : DCHECK_NULL(left_hand_side_);
208 19921 : left_hand_side_ = left_hand_side;
209 : }
210 :
211 : V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
212 : const Item* completed_item, const LexerResult& tokens) const;
213 :
214 : private:
215 : Symbol* left_hand_side_ = nullptr;
216 : std::vector<Symbol*> right_hand_side_;
217 : Action action_;
218 : };
219 :
220 : // A Symbol represents a terminal or a non-terminal of the grammar.
221 : // It stores the list of rules, which have this symbol as the
222 : // left-hand side.
223 : // Terminals have an empty list of rules, they are created by the Lexer
224 : // instead of from rules.
225 : // Symbols need to reside at stable memory addresses, because the addresses are
226 : // used in the parser.
227 12518 : class Symbol {
228 : public:
229 5905 : Symbol() : Symbol({}) {}
230 30104 : Symbol(std::initializer_list<Rule> rules) { *this = rules; }
231 :
232 : V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules);
233 :
234 : bool IsTerminal() const { return rules_.empty(); }
235 : Rule* rule(size_t index) const { return rules_[index].get(); }
236 : size_t rule_number() const { return rules_.size(); }
237 :
238 19921 : void AddRule(const Rule& rule) {
239 39842 : rules_.push_back(base::make_unique<Rule>(rule));
240 : rules_.back()->SetLeftHandSide(this);
241 19921 : }
242 :
243 : V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
244 : const Item* item, const LexerResult& tokens);
245 :
246 : private:
247 : std::vector<std::unique_ptr<Rule>> rules_;
248 :
249 : // Disallow copying and moving to ensure Symbol has a stable address.
250 : DISALLOW_COPY_AND_ASSIGN(Symbol);
251 : };
252 :
253 : // Items are the core datastructure of Earley's algorithm.
254 : // They consist of a (partially) matched rule, a marked position inside of the
255 : // right-hand side of the rule (traditionally written as a dot) and an input
256 : // range from {start} to {pos} that matches the symbols of the right-hand side
257 : // that are left of the mark. In addition, they store a child and a left-sibling
258 : // pointer to reconstruct the AST in the end.
259 : class Item {
260 : public:
261 : Item(const Rule* rule, size_t mark, size_t start, size_t pos)
262 6923193 : : rule_(rule), mark_(mark), start_(start), pos_(pos) {
263 : DCHECK_LE(mark_, right().size());
264 : }
265 :
266 : // A complete item has the mark at the right end, which means the input range
267 : // matches the complete rule.
268 : bool IsComplete() const {
269 : DCHECK_LE(mark_, right().size());
270 2747911 : return mark_ == right().size();
271 : }
272 :
273 : // The symbol right after the mark is expected at {pos} for this item to
274 : // advance.
275 : Symbol* NextSymbol() const {
276 : DCHECK(!IsComplete());
277 : DCHECK_LT(mark_, right().size());
278 2322868 : return right()[mark_];
279 : }
280 :
281 : // We successfully parsed NextSymbol() between {pos} and {new_pos}.
282 : // If NextSymbol() was a non-terminal, then {child} is a pointer to a
283 : // completed item for this parse.
284 : // We create a new item, which moves the mark one forward.
285 : Item Advance(size_t new_pos, const Item* child = nullptr) const {
286 : if (child) {
287 : DCHECK(child->IsComplete());
288 : DCHECK_EQ(pos(), child->start());
289 : DCHECK_EQ(new_pos, child->pos());
290 : DCHECK_EQ(NextSymbol(), child->left());
291 : }
292 764809 : Item result(rule_, mark_ + 1, start_, new_pos);
293 764809 : result.prev_ = this;
294 696640 : result.child_ = child;
295 : return result;
296 : }
297 :
298 : // Collect the items representing the AST children of this completed item.
299 : std::vector<const Item*> Children() const;
300 : // The matched input separated according to the next branching AST level.
301 : std::string SplitByChildren(const LexerResult& tokens) const;
302 : // Check if {other} results in the same AST as this Item.
303 : void CheckAmbiguity(const Item& other, const LexerResult& tokens) const;
304 :
305 : MatchedInput GetMatchedInput(const LexerResult& tokens) const {
306 252841 : return {tokens.token_contents[start_].begin,
307 : start_ == pos_ ? tokens.token_contents[start_].begin
308 214669 : : tokens.token_contents[pos_ - 1].end,
309 467510 : tokens.token_contents[start_].pos};
310 : }
311 :
312 : // We exclude {prev_} and {child_} from equality and hash computations,
313 : // because they are just globally unique data associated with an item.
314 : bool operator==(const Item& other) const {
315 3291567 : return rule_ == other.rule_ && mark_ == other.mark_ &&
316 3291567 : start_ == other.start_ && pos_ == other.pos_;
317 : }
318 :
319 6923193 : friend size_t hash_value(const Item& i) {
320 6923193 : return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_);
321 : }
322 :
323 252773 : const Rule* rule() const { return rule_; }
324 : Symbol* left() const { return rule_->left(); }
325 : const std::vector<Symbol*>& right() const { return rule_->right(); }
326 : size_t pos() const { return pos_; }
327 : size_t start() const { return start_; }
328 :
329 : private:
330 : const Rule* rule_;
331 : size_t mark_;
332 : size_t start_;
333 : size_t pos_;
334 :
335 : const Item* prev_ = nullptr;
336 : const Item* child_ = nullptr;
337 : };
338 :
339 : inline base::Optional<ParseResult> Symbol::RunAction(
340 : const Item* item, const LexerResult& tokens) {
341 : DCHECK(item->IsComplete());
342 : DCHECK_EQ(item->left(), this);
343 252841 : return item->rule()->RunAction(item, tokens);
344 : }
345 :
346 : V8_EXPORT_PRIVATE const Item* RunEarleyAlgorithm(
347 : Symbol* start, const LexerResult& tokens,
348 : std::unordered_set<Item, base::hash<Item>>* processed);
349 :
350 69 : inline base::Optional<ParseResult> ParseTokens(Symbol* start,
351 : const LexerResult& tokens) {
352 : std::unordered_set<Item, base::hash<Item>> table;
353 69 : const Item* final_item = RunEarleyAlgorithm(start, tokens, &table);
354 68 : return start->RunAction(final_item, tokens);
355 : }
356 :
357 : // The lexical syntax is dynamically defined while building the grammar by
358 : // adding patterns and keywords to the Lexer.
359 : // The term keyword here can stand for any fixed character sequence, including
360 : // operators and parentheses.
361 : // Each pattern or keyword automatically gets a terminal symbol associated with
362 : // it. These symbols form the result of the lexing.
363 : // Patterns and keywords are matched using the longest match principle. If the
364 : // longest matching pattern coincides with a keyword, the keyword symbol is
365 : // chosen instead of the pattern.
366 : // In addition, there is a single whitespace pattern which is consumed but does
367 : // not become part of the token list.
368 207 : class Lexer {
369 : public:
370 : // Functions to define patterns. They try to match starting from {pos}. If
371 : // successful, they return true and advance {pos}. Otherwise, {pos} stays
372 : // unchanged.
373 : using PatternFunction = bool (*)(InputPosition* pos);
374 :
375 : void SetWhitespace(PatternFunction whitespace) {
376 69 : match_whitespace_ = whitespace;
377 : }
378 :
379 313 : Symbol* Pattern(PatternFunction pattern) { return &patterns_[pattern]; }
380 12349 : Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; }
381 : V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input);
382 :
383 : private:
384 0 : PatternFunction match_whitespace_ = [](InputPosition*) { return false; };
385 : std::map<PatternFunction, Symbol> patterns_;
386 : std::map<std::string, Symbol> keywords_;
387 : Symbol* MatchToken(InputPosition* pos, InputPosition end);
388 : };
389 :
390 : // A grammar can have a result, which is the results of the start symbol.
391 : // Grammar is intended to be subclassed, with Symbol members forming the
392 : // mutually recursive rules of the grammar.
393 69 : class Grammar {
394 : public:
395 : using PatternFunction = Lexer::PatternFunction;
396 :
397 69 : explicit Grammar(Symbol* start) : start_(start) {}
398 :
399 70 : base::Optional<ParseResult> Parse(const std::string& input) {
400 139 : LexerResult tokens = lexer().RunLexer(input);
401 137 : return ParseTokens(start_, tokens);
402 : }
403 :
404 : protected:
405 : Symbol* Token(const std::string& s) { return lexer_.Token(s); }
406 : Symbol* Pattern(PatternFunction pattern) { return lexer_.Pattern(pattern); }
407 : void SetWhitespace(PatternFunction ws) { lexer_.SetWhitespace(ws); }
408 :
409 : // NewSymbol() allocates a fresh symbol and stores it in the current grammar.
410 : // This is necessary to define helpers that create new symbols.
411 6914 : Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) {
412 6914 : Symbol* result = new Symbol(rules);
413 13828 : generated_symbols_.push_back(std::unique_ptr<Symbol>(result));
414 6914 : return result;
415 : }
416 :
417 : // Helper functions to define lexer patterns. If they match, they return true
418 : // and advance {pos}. Otherwise, {pos} is unchanged.
419 : V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int),
420 : InputPosition* pos);
421 : V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char),
422 : InputPosition* pos);
423 : V8_EXPORT_PRIVATE static bool MatchAnyChar(InputPosition* pos);
424 : V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos);
425 :
426 : // The action MatchInput() produces the input matched by the rule as
427 : // result.
428 24868 : static base::Optional<ParseResult> YieldMatchedInput(
429 : ParseResultIterator* child_results) {
430 74604 : return ParseResult{child_results->matched_input().ToString()};
431 : }
432 :
433 : // Create a new symbol to parse the given sequence of symbols.
434 : // At most one of the symbols can return a result.
435 1134 : Symbol* Sequence(std::vector<Symbol*> symbols) {
436 2268 : return NewSymbol({Rule(std::move(symbols))});
437 : }
438 :
439 : template <class T, T value>
440 9994 : static base::Optional<ParseResult> YieldIntegralConstant(
441 : ParseResultIterator* child_results) {
442 9994 : return ParseResult{value};
443 : }
444 :
445 : template <class T>
446 29492 : static base::Optional<ParseResult> YieldDefaultValue(
447 : ParseResultIterator* child_results) {
448 72822 : return ParseResult{T{}};
449 : }
450 :
451 : template <class From, class To>
452 9828 : static base::Optional<ParseResult> CastParseResult(
453 : ParseResultIterator* child_results) {
454 17749 : To result = std::move(child_results->NextAs<From>());
455 27658 : return ParseResult{std::move(result)};
456 : }
457 :
458 : // Try to parse {s} and return the result of type {Result} casted to {T}.
459 : // Otherwise, the result is a default-constructed {T}.
460 : template <class T, class Result = T>
461 2782 : Symbol* TryOrDefault(Symbol* s) {
462 19474 : return NewSymbol({Rule({s}, CastParseResult<Result, T>),
463 5564 : Rule({}, YieldDefaultValue<T>)});
464 : }
465 :
466 : template <class T>
467 6240 : static base::Optional<ParseResult> MakeSingletonVector(
468 : ParseResultIterator* child_results) {
469 6395 : T x = child_results->NextAs<T>();
470 374 : std::vector<T> result;
471 : result.push_back(std::move(x));
472 24586 : return ParseResult{std::move(result)};
473 : }
474 :
475 : template <class T>
476 6063 : static base::Optional<ParseResult> MakeExtendedVector(
477 : ParseResultIterator* child_results) {
478 6348 : std::vector<T> l = child_results->NextAs<std::vector<T>>();
479 6090 : T x = child_results->NextAs<T>();
480 : l.push_back(std::move(x));
481 23967 : return ParseResult{std::move(l)};
482 : }
483 :
484 : // For example, NonemptyList(Token("A"), Token(",")) parses any of
485 : // A or A,A or A,A,A and so on.
486 : template <class T>
487 1108 : Symbol* NonemptyList(Symbol* element,
488 : base::Optional<Symbol*> separator = {}) {
489 1108 : Symbol* list = NewSymbol();
490 7242 : *list = {Rule({element}, MakeSingletonVector<T>),
491 : separator
492 : ? Rule({list, *separator, element}, MakeExtendedVector<T>)
493 : : Rule({list, element}, MakeExtendedVector<T>)};
494 1108 : return list;
495 : }
496 :
497 : template <class T>
498 : Symbol* List(Symbol* element, base::Optional<Symbol*> separator = {}) {
499 892 : return TryOrDefault<std::vector<T>>(NonemptyList<T>(element, separator));
500 : }
501 :
502 : template <class T>
503 : Symbol* Optional(Symbol* x) {
504 1188 : return TryOrDefault<base::Optional<T>, T>(x);
505 : }
506 :
507 972 : Symbol* CheckIf(Symbol* x) {
508 6804 : return NewSymbol({Rule({x}, YieldIntegralConstant<bool, true>),
509 1944 : Rule({}, YieldIntegralConstant<bool, false>)});
510 : }
511 :
512 70 : Lexer& lexer() { return lexer_; }
513 :
514 : private:
515 : Lexer lexer_;
516 : std::vector<std::unique_ptr<Symbol>> generated_symbols_;
517 : Symbol* start_;
518 : };
519 :
520 : } // namespace torque
521 : } // namespace internal
522 : } // namespace v8
523 :
524 : #endif // V8_TORQUE_EARLEY_PARSER_H_
|