Line data Source code
1 : // Copyright 2018 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_TORQUE_EARLEY_PARSER_H_
6 : #define V8_TORQUE_EARLEY_PARSER_H_
7 :
8 : #include <map>
9 : #include <vector>
10 :
11 : #include "src/base/optional.h"
12 : #include "src/torque/contextual.h"
13 : #include "src/torque/source-positions.h"
14 : #include "src/torque/utils.h"
15 :
16 : namespace v8 {
17 : namespace internal {
18 : namespace torque {
19 :
20 : class Symbol;
21 : class Item;
22 :
23 : class ParseResultHolderBase {
24 : public:
25 : enum class TypeId;
26 131368 : virtual ~ParseResultHolderBase() = default;
27 : template <class T>
28 : T& Cast();
29 : template <class T>
30 : const T& Cast() const;
31 :
32 : protected:
33 131368 : explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) {
34 : // MSVC wrongly complains about type_id_ being an unused private field.
35 : USE(type_id_);
36 : }
37 :
38 : private:
39 : const TypeId type_id_;
40 : };
41 :
42 : enum class ParseResultHolderBase::TypeId {
43 : kStdString,
44 : kBool,
45 : kStdVectorOfString,
46 : kExpressionPtr,
47 : kIdentifierPtr,
48 : kOptionalIdentifierPtr,
49 : kLocationExpressionPtr,
50 : kStatementPtr,
51 : kDeclarationPtr,
52 : kTypeExpressionPtr,
53 : kOptionalTypeExpressionPtr,
54 : kLabelBlockPtr,
55 : kOptionalLabelBlockPtr,
56 : kNameAndTypeExpression,
57 : kClassFieldExpression,
58 : kStructFieldExpression,
59 : kStdVectorOfNameAndTypeExpression,
60 : kStdVectorOfClassFieldExpression,
61 : kStdVectorOfStructFieldExpression,
62 : kIncrementDecrementOperator,
63 : kOptionalStdString,
64 : kStdVectorOfStatementPtr,
65 : kStdVectorOfDeclarationPtr,
66 : kStdVectorOfExpressionPtr,
67 : kExpressionWithSource,
68 : kParameterList,
69 : kRangeExpression,
70 : kOptionalRangeExpression,
71 : kTypeList,
72 : kOptionalTypeList,
73 : kLabelAndTypes,
74 : kStdVectorOfLabelAndTypes,
75 : kStdVectorOfLabelBlockPtr,
76 : kOptionalStatementPtr,
77 : kOptionalExpressionPtr,
78 : kTypeswitchCase,
79 : kStdVectorOfTypeswitchCase,
80 : kStdVectorOfIdentifierPtr,
81 :
82 : kJsonValue,
83 : kJsonMember,
84 : kStdVectorOfJsonValue,
85 : kStdVectorOfJsonMember,
86 : };
87 :
88 : using ParseResultTypeId = ParseResultHolderBase::TypeId;
89 :
90 : template <class T>
91 262736 : class ParseResultHolder : public ParseResultHolderBase {
92 : public:
93 164 : explicit ParseResultHolder(T value)
94 131592 : : ParseResultHolderBase(id), value_(std::move(value)) {}
95 :
96 : private:
97 : V8_EXPORT_PRIVATE static const TypeId id;
98 : friend class ParseResultHolderBase;
99 : T value_;
100 : };
101 :
102 : template <class T>
103 : T& ParseResultHolderBase::Cast() {
104 131368 : CHECK_EQ(ParseResultHolder<T>::id, type_id_);
105 1477 : return static_cast<ParseResultHolder<T>*>(this)->value_;
106 : }
107 :
108 : template <class T>
109 : const T& ParseResultHolderBase::Cast() const {
110 : CHECK_EQ(ParseResultHolder<T>::id, type_id_);
111 : return static_cast<const ParseResultHolder<T>*>(this)->value_;
112 : }
113 :
114 887417 : class ParseResult {
115 : public:
116 : template <class T>
117 278035 : explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {}
118 :
119 : template <class T>
120 : const T& Cast() const& {
121 : return value_->Cast<T>();
122 : }
123 : template <class T>
124 : T& Cast() & {
125 : return value_->Cast<T>();
126 : }
127 : template <class T>
128 : T&& Cast() && {
129 : return std::move(value_->Cast<T>());
130 : }
131 :
132 : private:
133 : std::unique_ptr<ParseResultHolderBase> value_;
134 : };
135 :
136 : using InputPosition = const char*;
137 :
138 : struct MatchedInput {
139 : MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos)
140 302393 : : begin(begin), end(end), pos(pos) {}
141 : InputPosition begin;
142 : InputPosition end;
143 : SourcePosition pos;
144 : std::string ToString() const { return {begin, end}; }
145 : };
146 :
147 : class ParseResultIterator {
148 : public:
149 : explicit ParseResultIterator(std::vector<ParseResult> results,
150 : MatchedInput matched_input)
151 245401 : : results_(std::move(results)), matched_input_(matched_input) {}
152 490802 : ~ParseResultIterator() {
153 : // Check that all parse results have been used.
154 490802 : CHECK_EQ(results_.size(), i_);
155 245401 : }
156 :
157 244437 : ParseResult Next() {
158 244437 : CHECK_LT(i_, results_.size());
159 488874 : return std::move(results_[i_++]);
160 : }
161 : template <class T>
162 131354 : T NextAs() {
163 323087 : return std::move(Next().Cast<T>());
164 : }
165 114848 : bool HasNext() const { return i_ < results_.size(); }
166 :
167 : const MatchedInput& matched_input() const { return matched_input_; }
168 :
169 : private:
170 : std::vector<ParseResult> results_;
171 : size_t i_ = 0;
172 : MatchedInput matched_input_;
173 :
174 : DISALLOW_COPY_AND_ASSIGN(ParseResultIterator);
175 : };
176 :
177 126 : struct LexerResult {
178 : std::vector<Symbol*> token_symbols;
179 : std::vector<MatchedInput> token_contents;
180 : };
181 :
182 : using Action =
183 : base::Optional<ParseResult> (*)(ParseResultIterator* child_results);
184 :
185 113136 : inline base::Optional<ParseResult> DefaultAction(
186 : ParseResultIterator* child_results) {
187 113136 : if (!child_results->HasNext()) return base::nullopt;
188 226166 : return child_results->Next();
189 : }
190 :
191 : // A rule of the context-free grammar. Each rule can have an action attached to
192 : // it, which is executed after the parsing is finished.
193 54033 : class Rule final {
194 : public:
195 : explicit Rule(std::vector<Symbol*> right_hand_side,
196 : Action action = DefaultAction)
197 32568 : : right_hand_side_(std::move(right_hand_side)), action_(action) {}
198 :
199 : Symbol* left() const {
200 : DCHECK_NOT_NULL(left_hand_side_);
201 : return left_hand_side_;
202 : }
203 : const std::vector<Symbol*>& right() const { return right_hand_side_; }
204 :
205 : void SetLeftHandSide(Symbol* left_hand_side) {
206 : DCHECK_NULL(left_hand_side_);
207 18011 : left_hand_side_ = left_hand_side;
208 : }
209 :
210 : V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
211 : const Item* completed_item, const LexerResult& tokens) const;
212 :
213 : private:
214 : Symbol* left_hand_side_ = nullptr;
215 : std::vector<Symbol*> right_hand_side_;
216 : Action action_;
217 : };
218 :
219 : // A Symbol represents a terminal or a non-terminal of the grammar.
220 : // It stores the list of rules, which have this symbol as the
221 : // left-hand side.
222 : // Terminals have an empty list of rules, they are created by the Lexer
223 : // instead of from rules.
224 : // Symbols need to reside at stable memory addresses, because the addresses are
225 : // used in the parser.
226 11397 : class Symbol {
227 : public:
228 9932 : Symbol() : Symbol({}) {}
229 13909 : Symbol(std::initializer_list<Rule> rules) { *this = rules; }
230 :
231 : V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules);
232 :
233 : bool IsTerminal() const { return rules_.empty(); }
234 : Rule* rule(size_t index) const { return rules_[index].get(); }
235 : size_t rule_number() const { return rules_.size(); }
236 :
237 18011 : void AddRule(const Rule& rule) {
238 18011 : rules_.push_back(base::make_unique<Rule>(rule));
239 : rules_.back()->SetLeftHandSide(this);
240 18011 : }
241 :
242 : V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
243 : const Item* item, const LexerResult& tokens);
244 :
245 : private:
246 : std::vector<std::unique_ptr<Rule>> rules_;
247 :
248 : // Disallow copying and moving to ensure Symbol has a stable address.
249 : DISALLOW_COPY_AND_ASSIGN(Symbol);
250 : };
251 :
252 : // Items are the core datastructure of Earley's algorithm.
253 : // They consist of a (partially) matched rule, a marked position inside of the
254 : // right-hand side of the rule (traditionally written as a dot) and an input
255 : // range from {start} to {pos} that matches the symbols of the right-hand side
256 : // that are left of the mark. In addition, they store a child and a left-sibling
257 : // pointer to reconstruct the AST in the end.
258 : class Item {
259 : public:
260 : Item(const Rule* rule, size_t mark, size_t start, size_t pos)
261 6001988 : : rule_(rule), mark_(mark), start_(start), pos_(pos) {
262 : DCHECK_LE(mark_, right().size());
263 : }
264 :
265 : // A complete item has the mark at the right end, which means the input range
266 : // matches the complete rule.
267 : bool IsComplete() const {
268 : DCHECK_LE(mark_, right().size());
269 2428458 : return mark_ == right().size();
270 : }
271 :
272 : // The symbol right after the mark is expected at {pos} for this item to
273 : // advance.
274 : Symbol* NextSymbol() const {
275 : DCHECK(!IsComplete());
276 : DCHECK_LT(mark_, right().size());
277 2035485 : return right()[mark_];
278 : }
279 :
280 : // We successfully parsed NextSymbol() between {pos} and {new_pos}.
281 : // If NextSymbol() was a non-terminal, then {child} is a pointer to a
282 : // completed item for this parse.
283 : // We create a new item, which moves the mark one forward.
284 : Item Advance(size_t new_pos, const Item* child = nullptr) const {
285 : if (child) {
286 : DCHECK(child->IsComplete());
287 : DCHECK_EQ(pos(), child->start());
288 : DCHECK_EQ(new_pos, child->pos());
289 : DCHECK_EQ(NextSymbol(), child->left());
290 : }
291 710884 : Item result(rule_, mark_ + 1, start_, new_pos);
292 710884 : result.prev_ = this;
293 646066 : result.child_ = child;
294 : return result;
295 : }
296 :
297 : // Collect the items representing the AST children of this completed item.
298 : std::vector<const Item*> Children() const;
299 : // The matched input separated according to the next branching AST level.
300 : std::string SplitByChildren(const LexerResult& tokens) const;
301 : // Check if {other} results in the same AST as this Item.
302 : void CheckAmbiguity(const Item& other, const LexerResult& tokens) const;
303 :
304 : MatchedInput GetMatchedInput(const LexerResult& tokens) const {
305 245401 : return {tokens.token_contents[start_].begin,
306 : start_ == pos_ ? tokens.token_contents[start_].begin
307 209320 : : tokens.token_contents[pos_ - 1].end,
308 454721 : tokens.token_contents[start_].pos};
309 : }
310 :
311 : // We exclude {prev_} and {child_} from equality and hash computations,
312 : // because they are just globally unique data associated with an item.
313 : bool operator==(const Item& other) const {
314 2787078 : return rule_ == other.rule_ && mark_ == other.mark_ &&
315 2787078 : start_ == other.start_ && pos_ == other.pos_;
316 : }
317 :
318 6001988 : friend size_t hash_value(const Item& i) {
319 6001988 : return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_);
320 : }
321 :
322 245338 : const Rule* rule() const { return rule_; }
323 : Symbol* left() const { return rule_->left(); }
324 : const std::vector<Symbol*>& right() const { return rule_->right(); }
325 : size_t pos() const { return pos_; }
326 : size_t start() const { return start_; }
327 :
328 : private:
329 : const Rule* rule_;
330 : size_t mark_;
331 : size_t start_;
332 : size_t pos_;
333 :
334 : const Item* prev_ = nullptr;
335 : const Item* child_ = nullptr;
336 : };
337 :
338 : inline base::Optional<ParseResult> Symbol::RunAction(
339 : const Item* item, const LexerResult& tokens) {
340 : DCHECK(item->IsComplete());
341 : DCHECK_EQ(item->left(), this);
342 245401 : return item->rule()->RunAction(item, tokens);
343 : }
344 :
345 : V8_EXPORT_PRIVATE const Item* RunEarleyAlgorithm(
346 : Symbol* start, const LexerResult& tokens,
347 : std::unordered_set<Item, base::hash<Item>>* processed);
348 :
349 63 : inline base::Optional<ParseResult> ParseTokens(Symbol* start,
350 : const LexerResult& tokens) {
351 : std::unordered_set<Item, base::hash<Item>> table;
352 63 : const Item* final_item = RunEarleyAlgorithm(start, tokens, &table);
353 63 : return start->RunAction(final_item, tokens);
354 : }
355 :
356 : // The lexical syntax is dynamically defined while building the grammar by
357 : // adding patterns and keywords to the Lexer.
358 : // The term keyword here can stand for any fixed character sequence, including
359 : // operators and parentheses.
360 : // Each pattern or keyword automatically gets a terminal symbol associated with
361 : // it. These symbols form the result of the lexing.
362 : // Patterns and keywords are matched using the longest match principle. If the
363 : // longest matching pattern coincides with a keyword, the keyword symbol is
364 : // chosen instead of the pattern.
365 : // In addition, there is a single whitespace pattern which is consumed but does
366 : // not become part of the token list.
367 186 : class Lexer {
368 : public:
369 : // Functions to define patterns. They try to match starting from {pos}. If
370 : // successful, they return true and advance {pos}. Otherwise, {pos} stays
371 : // unchanged.
372 : using PatternFunction = bool (*)(InputPosition* pos);
373 :
374 : void SetWhitespace(PatternFunction whitespace) {
375 62 : match_whitespace_ = whitespace;
376 : }
377 :
378 282 : Symbol* Pattern(PatternFunction pattern) { return &patterns_[pattern]; }
379 11003 : Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; }
380 : V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input);
381 :
382 : private:
383 0 : PatternFunction match_whitespace_ = [](InputPosition*) { return false; };
384 : std::map<PatternFunction, Symbol> patterns_;
385 : std::map<std::string, Symbol> keywords_;
386 : Symbol* MatchToken(InputPosition* pos, InputPosition end);
387 : };
388 :
389 : // A grammar can have a result, which is the results of the start symbol.
390 : // Grammar is intended to be subclassed, with Symbol members forming the
391 : // mutually recursive rules of the grammar.
392 62 : class Grammar {
393 : public:
394 : using PatternFunction = Lexer::PatternFunction;
395 :
396 62 : explicit Grammar(Symbol* start) : start_(start) {}
397 :
398 63 : base::Optional<ParseResult> Parse(const std::string& input) {
399 126 : LexerResult tokens = lexer().RunLexer(input);
400 126 : return ParseTokens(start_, tokens);
401 : }
402 :
403 : protected:
404 : Symbol* Token(const std::string& s) { return lexer_.Token(s); }
405 : Symbol* Pattern(PatternFunction pattern) { return lexer_.Pattern(pattern); }
406 : void SetWhitespace(PatternFunction ws) { lexer_.SetWhitespace(ws); }
407 :
408 : // NewSymbol() allocates a fresh symbol and stores it in the current grammar.
409 : // This is necessary to define helpers that create new symbols.
410 6369 : Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) {
411 6369 : Symbol* result = new Symbol(rules);
412 12738 : generated_symbols_.push_back(std::unique_ptr<Symbol>(result));
413 6369 : return result;
414 : }
415 :
416 : // Helper functions to define lexer patterns. If they match, they return true
417 : // and advance {pos}. Otherwise, {pos} is unchanged.
418 : V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int),
419 : InputPosition* pos);
420 : V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char),
421 : InputPosition* pos);
422 : V8_EXPORT_PRIVATE static bool MatchAnyChar(InputPosition* pos);
423 : V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos);
424 :
425 : // The action MatchInput() produces the input matched by the rule as
426 : // result.
427 23750 : static base::Optional<ParseResult> YieldMatchedInput(
428 : ParseResultIterator* child_results) {
429 71250 : return ParseResult{child_results->matched_input().ToString()};
430 : }
431 :
432 : // Create a new symbol to parse the given sequence of symbols.
433 : // At most one of the symbols can return a result.
434 1078 : Symbol* Sequence(std::vector<Symbol*> symbols) {
435 2156 : return NewSymbol({Rule(std::move(symbols))});
436 : }
437 :
438 : template <class T, T value>
439 8947 : static base::Optional<ParseResult> YieldIntegralConstant(
440 : ParseResultIterator* child_results) {
441 8947 : return ParseResult{value};
442 : }
443 :
444 : template <class T>
445 28314 : static base::Optional<ParseResult> YieldDefaultValue(
446 : ParseResultIterator* child_results) {
447 69895 : return ParseResult{T{}};
448 : }
449 :
450 : template <class From, class To>
451 16266 : static base::Optional<ParseResult> CastParseResult(
452 : ParseResultIterator* child_results) {
453 23835 : To result = std::move(child_results->NextAs<From>());
454 33320 : return ParseResult{std::move(result)};
455 : }
456 :
457 : // Try to parse {s} and return the result of type {Result} casted to {T}.
458 : // Otherwise, the result is a default-constructed {T}.
459 : template <class T, class Result = T>
460 2621 : Symbol* TryOrDefault(Symbol* s) {
461 18347 : return NewSymbol({Rule({s}, CastParseResult<Result, T>),
462 5242 : Rule({}, YieldDefaultValue<T>)});
463 : }
464 :
465 : template <class T>
466 5994 : static base::Optional<ParseResult> MakeSingletonVector(
467 : ParseResultIterator* child_results) {
468 6143 : T x = child_results->NextAs<T>();
469 344 : std::vector<T> result;
470 : result.push_back(std::move(x));
471 23632 : return ParseResult{std::move(result)};
472 : }
473 :
474 : template <class T>
475 5819 : static base::Optional<ParseResult> MakeExtendedVector(
476 : ParseResultIterator* child_results) {
477 6051 : std::vector<T> l = child_results->NextAs<std::vector<T>>();
478 5845 : T x = child_results->NextAs<T>();
479 : l.push_back(std::move(x));
480 23044 : return ParseResult{std::move(l)};
481 : }
482 :
483 : // For example, NonemptyList(Token("A"), Token(",")) parses any of
484 : // A or A,A or A,A,A and so on.
485 : template <class T>
486 1102 : Symbol* NonemptyList(Symbol* element,
487 : base::Optional<Symbol*> separator = {}) {
488 1102 : Symbol* list = NewSymbol();
489 7200 : *list = {Rule({element}, MakeSingletonVector<T>),
490 : separator
491 : ? Rule({list, *separator, element}, MakeExtendedVector<T>)
492 : : Rule({list, element}, MakeExtendedVector<T>)};
493 1102 : return list;
494 : }
495 :
496 : template <class T>
497 : Symbol* List(Symbol* element, base::Optional<Symbol*> separator = {}) {
498 906 : return TryOrDefault<std::vector<T>>(NonemptyList<T>(element, separator));
499 : }
500 :
501 : template <class T>
502 : Symbol* Optional(Symbol* x) {
503 1078 : return TryOrDefault<base::Optional<T>, T>(x);
504 : }
505 :
506 735 : Symbol* CheckIf(Symbol* x) {
507 5145 : return NewSymbol({Rule({x}, YieldIntegralConstant<bool, true>),
508 1470 : Rule({}, YieldIntegralConstant<bool, false>)});
509 : }
510 :
511 63 : Lexer& lexer() { return lexer_; }
512 :
513 : private:
514 : Lexer lexer_;
515 : std::vector<std::unique_ptr<Symbol>> generated_symbols_;
516 : Symbol* start_;
517 : };
518 :
519 : } // namespace torque
520 : } // namespace internal
521 : } // namespace v8
522 :
523 : #endif // V8_TORQUE_EARLEY_PARSER_H_
|