Line data Source code
1 : // Copyright 2011 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : // Features shared by parsing and pre-parsing scanners.
6 :
7 : #ifndef V8_PARSING_SCANNER_H_
8 : #define V8_PARSING_SCANNER_H_
9 :
10 : #include <algorithm>
11 :
12 : #include "src/allocation.h"
13 : #include "src/base/logging.h"
14 : #include "src/char-predicates.h"
15 : #include "src/globals.h"
16 : #include "src/message-template.h"
17 : #include "src/parsing/token.h"
18 : #include "src/pointer-with-payload.h"
19 : #include "src/unicode-decoder.h"
20 : #include "src/unicode.h"
21 :
22 : namespace v8 {
23 : namespace internal {
24 :
25 : class AstRawString;
26 : class AstValueFactory;
27 : class ExternalOneByteString;
28 : class ExternalTwoByteString;
29 : class ParserRecorder;
30 : class RuntimeCallStats;
31 : class Zone;
32 :
33 : // ---------------------------------------------------------------------
34 : // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
35 : // A code unit is a 16-bit value representing either a 16-bit code point
36 : // or one half of a surrogate pair that together encode a 21-bit code point.
37 : class Utf16CharacterStream {
38 : public:
39 : static const uc32 kEndOfInput = -1;
40 :
41 2992370 : virtual ~Utf16CharacterStream() = default;
42 :
43 : V8_INLINE void set_parser_error() {
44 718326 : buffer_cursor_ = buffer_end_;  // Make the current buffer look exhausted...
45 718326 : has_parser_error_ = true;  // ...and keep ReadBlockChecked() from refilling it.
46 : }
47 49124 : V8_INLINE void reset_parser_error_flag() { has_parser_error_ = false; }
48 54755015 : V8_INLINE bool has_parser_error() const { return has_parser_error_; }
49 :
50 862450221 : inline uc32 Peek() {  // Returns the next code unit without consuming it.
51 862450221 : if (V8_LIKELY(buffer_cursor_ < buffer_end_)) {
52 855585356 : return static_cast<uc32>(*buffer_cursor_);
53 6864868 : } else if (ReadBlockChecked()) {  // Buffer exhausted: try to get more data.
54 4563595 : return static_cast<uc32>(*buffer_cursor_);
55 : } else {
56 : return kEndOfInput;
57 : }
58 : }
59 :
60 : // Returns and advances past the next UTF-16 code unit in the input
61 : // stream. If there are no more code units it returns kEndOfInput.
62 : inline uc32 Advance() {
63 855528044 : uc32 result = Peek();
64 855354790 : buffer_cursor_++;  // Also incremented when result is kEndOfInput.
65 : return result;
66 : }
67 :
68 : // Advances through the input stream until it finds a UTF-16 code unit for
69 : // which check returns true, consumes that code unit and returns it. If no
70 : // such code unit is found before the end of input it returns kEndOfInput.
71 : template <typename FunctionType>
72 : V8_INLINE uc32 AdvanceUntil(FunctionType check) {
73 : while (true) {
74 : auto next_cursor_pos =
75 154735650 : std::find_if(buffer_cursor_, buffer_end_, [&check](uint16_t raw_c0_) {
76 972081665 : uc32 c0_ = static_cast<uc32>(raw_c0_);
77 972015021 : return check(c0_);
78 154735650 : });
79 :
80 154745555 : if (next_cursor_pos == buffer_end_) {
81 2316199 : buffer_cursor_ = buffer_end_;
82 2316201 : if (!ReadBlockChecked()) {
83 115732 : buffer_cursor_++;  // Step past the end, as Advance() does at end of input.
84 : return kEndOfInput;
85 : }
86 : } else {
87 152429356 : buffer_cursor_ = next_cursor_pos + 1;
88 152429356 : return static_cast<uc32>(*next_cursor_pos);
89 : }
90 : }
91 : }
92 :
93 : // Steps back by one UTF-16 code unit in the input stream.
94 : // This undoes the most recent Advance().
95 10414536 : inline void Back() {
96 : // The common case - if the previous character is within
97 : // buffer_start_ .. buffer_end_ it will be handled locally.
98 : // Otherwise, a new block is requested.
99 10414536 : if (V8_LIKELY(buffer_cursor_ > buffer_start_)) {
100 10210611 : buffer_cursor_--;
101 : } else {
102 203925 : ReadBlockAt(pos() - 1);
103 : }
104 10414536 : }
105 :
106 : inline size_t pos() const {
107 1439281511 : return buffer_pos_ + (buffer_cursor_ - buffer_start_);
108 : }
109 :
110 115588 : inline void Seek(size_t pos) {  // Repositions the stream at the given position.
111 115588 : if (V8_LIKELY(pos >= buffer_pos_ &&
112 : pos < (buffer_pos_ + (buffer_end_ - buffer_start_)))) {
113 97645 : buffer_cursor_ = buffer_start_ + (pos - buffer_pos_);  // Target is inside the current buffer.
114 : } else {
115 : ReadBlockAt(pos);  // Target is outside the current buffer: request a new block.
116 : }
117 115588 : }
118 :
119 : // Returns true if the stream can be cloned and cannot access the V8 heap.
120 : bool can_be_cloned_for_parallel_access() const {
121 66 : return can_be_cloned() && !can_access_heap();
122 : }
123 :
124 : // Returns true if the stream can be cloned with Clone.
125 : // TODO(rmcilroy): Remove this once ChunkedStreams can be cloned.
126 : virtual bool can_be_cloned() const = 0;
127 :
128 : // Clones the character stream to enable another independent scanner to access
129 : // the same underlying stream.
130 : virtual std::unique_ptr<Utf16CharacterStream> Clone() const = 0;
131 :
132 : // Returns true if the stream could access the V8 heap after construction.
133 : virtual bool can_access_heap() const = 0;
134 :
135 : RuntimeCallStats* runtime_call_stats() const { return runtime_call_stats_; }
136 : void set_runtime_call_stats(RuntimeCallStats* runtime_call_stats) {
137 13468 : runtime_call_stats_ = runtime_call_stats;
138 : }
139 :
140 : protected:
141 : Utf16CharacterStream(const uint16_t* buffer_start,
142 : const uint16_t* buffer_cursor,
143 : const uint16_t* buffer_end, size_t buffer_pos)
144 : : buffer_start_(buffer_start),
145 : buffer_cursor_(buffer_cursor),
146 : buffer_end_(buffer_end),
147 2992391 : buffer_pos_(buffer_pos) {}
148 : Utf16CharacterStream() : Utf16CharacterStream(nullptr, nullptr, nullptr, 0) {}
149 :
150 592769 : bool ReadBlockChecked() {  // ReadBlock() plus DCHECKs; never reads after a parser error.
151 : size_t position = pos();
152 : USE(position);  // position is otherwise only read by the DCHECK_EQ below.
153 9402925 : bool success = !has_parser_error() && ReadBlock();
154 :
155 : // Post-conditions: 1. We should always be at the right position.
156 : // 2. Cursor should be inside the buffer.
157 : // 3. We should have more characters available iff success.
158 : DCHECK_EQ(pos(), position);
159 : DCHECK_LE(buffer_cursor_, buffer_end_);
160 : DCHECK_LE(buffer_start_, buffer_cursor_);
161 : DCHECK_EQ(success, buffer_cursor_ < buffer_end_);
162 592779 : return success;
163 : }
164 :
165 : void ReadBlockAt(size_t new_pos) {
166 : // The callers of this method (e.g. Back/Seek) should handle the easy
167 : // case (seeking within the current buffer), and we should only get here
168 : // if we actually require new data.
169 : // (This is really an efficiency check, not a correctness invariant.)
170 : DCHECK(new_pos < buffer_pos_ ||
171 : new_pos >= buffer_pos_ + (buffer_end_ - buffer_start_));
172 :
173 : // Change pos() to point to new_pos.
174 221868 : buffer_pos_ = new_pos;
175 221868 : buffer_cursor_ = buffer_start_;
176 : DCHECK_EQ(pos(), new_pos);
177 : ReadBlockChecked();  // Result is ignored here; callers see end of input via Peek().
178 : }
179 :
180 : // Read more data, and update buffer_*_ to point to it.
181 : // Returns true if more data was available.
182 : //
183 : // ReadBlock() may modify any of the buffer_*_ members, but must ensure that
184 : // the result of pos() remains unaffected.
185 : //
186 : // Examples:
187 : // - a stream could fill a separate buffer. Then buffer_start_ and
188 : // buffer_cursor_ would point to the beginning of the buffer, and
189 : // buffer_pos_ would be the old pos().
190 : // - a stream with existing buffer chunks would set buffer_start_ and
191 : // buffer_end_ to cover the full chunk, and then buffer_cursor_ would
192 : // point into the middle of the buffer, while buffer_pos_ would describe
193 : // the start of the buffer.
194 : virtual bool ReadBlock() = 0;
195 :
196 : const uint16_t* buffer_start_;  // First code unit of the current buffer.
197 : const uint16_t* buffer_cursor_;  // Next code unit to be read.
198 : const uint16_t* buffer_end_;  // One past the last code unit of the current buffer.
199 : size_t buffer_pos_;  // Stream position that corresponds to buffer_start_.
200 : RuntimeCallStats* runtime_call_stats_ = nullptr;  // Not set by any constructor; null until set_runtime_call_stats().
201 : bool has_parser_error_ = false;
202 : };
203 :
204 : // ----------------------------------------------------------------------------
205 : // JavaScript Scanner.
206 :
207 5983354 : class V8_EXPORT_PRIVATE Scanner {
208 : public:
209 : // Scoped helper for a re-settable bookmark.
210 : class V8_EXPORT_PRIVATE BookmarkScope {
211 : public:
212 : explicit BookmarkScope(Scanner* scanner)
213 : : scanner_(scanner),
214 : bookmark_(kNoBookmark),
215 5129896 : had_parser_error_(scanner->has_parser_error()) {
216 : DCHECK_NOT_NULL(scanner_);
217 : }
218 : ~BookmarkScope() = default;
219 :
220 : void Set(size_t bookmark);
221 : void Apply();
222 : bool HasBeenSet() const;
223 : bool HasBeenApplied() const;
224 :
225 : private:
226 : static const size_t kNoBookmark;
227 : static const size_t kBookmarkWasApplied;
228 :
229 : Scanner* scanner_;
230 : size_t bookmark_;
231 : bool had_parser_error_;
232 :
233 : DISALLOW_COPY_AND_ASSIGN(BookmarkScope);
234 : };
235 :
236 : // Sets the Scanner into an error state to stop further scanning and terminate
237 : // the parsing by only returning ILLEGAL tokens after that.
238 : V8_INLINE void set_parser_error() {
239 4052489 : if (!has_parser_error()) {
240 718326 : c0_ = kEndOfInput;
241 : source_->set_parser_error();
242 718326 : for (TokenDesc& desc : token_storage_) desc.token = Token::ILLEGAL;
243 : }
244 : }
245 : V8_INLINE void reset_parser_error_flag() {
246 49124 : source_->reset_parser_error_flag();
247 : }
248 : V8_INLINE bool has_parser_error() const {
249 40475932 : return source_->has_parser_error();
250 : }
251 :
252 : // Representation of an interval of source positions.
253 : struct Location {
254 57735226 : Location(int b, int e) : beg_pos(b), end_pos(e) { }
255 238493931 : Location() : beg_pos(0), end_pos(0) { }
256 :
257 2640881 : int length() const { return end_pos - beg_pos; }
258 529580209 : bool IsValid() const { return IsInRange(beg_pos, 0, end_pos); }
259 :
260 : static Location invalid() { return Location(-1, 0); }
261 :
262 : int beg_pos;
263 : int end_pos;
264 : };
265 :
266 : // -1 is outside of the range of any real source code.
267 : static const int kNoOctalLocation = -1;
268 : static const uc32 kEndOfInput = Utf16CharacterStream::kEndOfInput;
269 :
270 : explicit Scanner(Utf16CharacterStream* source, bool is_module);
271 :
272 : void Initialize();
273 :
274 : // Returns the next token and advances input.
275 : Token::Value Next();
276 : // Returns the token following peek()
277 : Token::Value PeekAhead();
278 : // Returns the current token again.
279 58628460 : Token::Value current_token() const { return current().token; }
280 :
281 : // Returns the location information for the current token
282 : // (the token last returned by Next()).
283 3983855 : const Location& location() const { return current().location; }
284 :
285 : // This error is specifically an invalid hex or unicode escape sequence.
286 : bool has_error() const { return scanner_error_ != MessageTemplate::kNone; }
287 : MessageTemplate error() const { return scanner_error_; }
288 : const Location& error_location() const { return scanner_error_location_; }
289 :
290 : bool has_invalid_template_escape() const {
291 153395 : return current().invalid_template_escape_message != MessageTemplate::kNone;
292 : }
293 : MessageTemplate invalid_template_escape_message() const {
294 : DCHECK(has_invalid_template_escape());
295 : return current().invalid_template_escape_message;
296 : }
297 :
298 : void clear_invalid_template_escape_message() {
299 : DCHECK(has_invalid_template_escape());
300 12980 : current_->invalid_template_escape_message = MessageTemplate::kNone;
301 : }
302 :
303 : Location invalid_template_escape_location() const {
304 : DCHECK(has_invalid_template_escape());
305 6344 : return current().invalid_template_escape_location;
306 : }
307 :
308 : // Similar functions for the upcoming token.
309 :
310 : // One token look-ahead (past the token returned by Next()).
311 1846418569 : Token::Value peek() const { return next().token; }
312 :
313 113226 : const Location& peek_location() const { return next().location; }
314 :
315 : bool literal_contains_escapes() const {
316 : return LiteralContainsEscapes(current());
317 : }
318 :
319 : bool next_literal_contains_escapes() const {
320 : return LiteralContainsEscapes(next());
321 : }
322 :
323 : const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory) const;
324 :
325 : const AstRawString* NextSymbol(AstValueFactory* ast_value_factory) const;
326 : const AstRawString* CurrentRawSymbol(
327 : AstValueFactory* ast_value_factory) const;
328 :
329 : double DoubleValue();
330 :
331 : const char* CurrentLiteralAsCString(Zone* zone) const;
332 :
333 : inline bool CurrentMatches(Token::Value token) const {
334 : DCHECK(Token::IsKeyword(token));
335 : return current().token == token;
336 : }
337 :
338 : template <size_t N>
339 2644617 : bool NextLiteralExactlyEquals(const char (&s)[N]) {
340 : DCHECK(next().CanAccessLiteral());
341 : // The length of the token is used to make sure the literal equals without
342 : // taking escape sequences (e.g., "use \x73trict") or line continuations
343 : // (e.g., "use \(newline) strict") into account.
344 2644617 : if (!is_next_literal_one_byte()) return false;
345 2640881 : if (peek_location().length() != N + 1) return false;  // N - 1 characters plus, presumably, the two quote delimiters.
346 :
347 : Vector<const uint8_t> next = next_literal_one_byte_string();
348 : const char* chars = reinterpret_cast<const char*>(next.start());
349 430619 : return next.length() == N - 1 && strncmp(s, chars, N - 1) == 0;  // N counts the terminating NUL of s.
350 : }
351 :
352 : template <size_t N>
353 4490385 : bool CurrentLiteralEquals(const char (&s)[N]) {
354 : DCHECK(current().CanAccessLiteral());
355 4490385 : if (!is_literal_one_byte()) return false;
356 :
357 : Vector<const uint8_t> current = literal_one_byte_string();
358 : const char* chars = reinterpret_cast<const char*>(current.start());
359 4490364 : return current.length() == N - 1 && strncmp(s, chars, N - 1) == 0;
360 : }
361 :
362 : // Returns the location of the last seen octal literal.
363 : Location octal_position() const { return octal_pos_; }
364 : void clear_octal_position() {
365 2865 : octal_pos_ = Location::invalid();
366 2865 : octal_message_ = MessageTemplate::kNone;
367 : }
368 : MessageTemplate octal_message() const { return octal_message_; }
369 :
370 : // Returns the value of the last smi that was scanned.
371 20232000 : uint32_t smi_value() const { return current().smi_value_; }
372 :
373 : // Seek forward to the given position. This operation does not
374 : // work in general, for instance when there are pushed back
375 : // characters, but works for seeking forward until simple delimiter
376 : // tokens, which is what it is used for.
377 : void SeekForward(int pos);
378 :
379 : // Returns true if there was a line terminator before the peek'ed token,
380 : // possibly inside a multi-line comment.
381 : bool HasLineTerminatorBeforeNext() const {
382 9722561 : return next().after_line_terminator;
383 : }
384 :
385 : bool HasLineTerminatorAfterNext() {  // Reports after_line_terminator of the token following peek().
386 161785 : Token::Value ensure_next_next = PeekAhead();  // Presumably called so that next_next() is populated - confirm.
387 : USE(ensure_next_next);
388 161785 : return next_next().after_line_terminator;
389 : }
390 :
391 : // Scans the input as a regular expression pattern, next token must be /(=).
392 : // Returns true if a pattern is scanned.
393 : bool ScanRegExpPattern();
394 : // Scans the input as regular expression flags. Returns the flags on success.
395 : Maybe<RegExp::Flags> ScanRegExpFlags();
396 :
397 : // Scans the input as a template literal
398 : Token::Value ScanTemplateContinuation() {
399 : DCHECK_EQ(next().token, Token::RBRACE);
400 : DCHECK_EQ(source_pos() - 1, next().location.beg_pos);
401 80011 : return ScanTemplateSpan();
402 : }
403 :
404 : Handle<String> SourceUrl(Isolate* isolate) const;
405 : Handle<String> SourceMappingUrl(Isolate* isolate) const;
406 :
407 : bool FoundHtmlComment() const { return found_html_comment_; }
408 :
409 : bool allow_harmony_private_fields() const {
410 : return allow_harmony_private_fields_;
411 : }
412 : void set_allow_harmony_private_fields(bool allow) {
413 3389976 : allow_harmony_private_fields_ = allow;
414 : }
415 : bool allow_harmony_numeric_separator() const {
416 : return allow_harmony_numeric_separator_;
417 : }
418 : void set_allow_harmony_numeric_separator(bool allow) {
419 2989926 : allow_harmony_numeric_separator_ = allow;
420 : }
421 :
422 : const Utf16CharacterStream* stream() const { return source_; }
423 :
424 : // If the next characters in the stream are "#!", the line is skipped.
425 : void SkipHashBang();
426 :
427 : private:
428 : // Scoped helper for saving & restoring scanner error state.
429 : // This is used for tagged template literals, in which normally forbidden
430 : // escape sequences are allowed.
431 : class ErrorState;
432 :
433 : // LiteralBuffer - Collector of chars of literals.
434 : class LiteralBuffer {
435 : public:
436 23937160 : LiteralBuffer() : backing_store_(), position_(0), is_one_byte_(true) {}
437 :
438 23937460 : ~LiteralBuffer() { backing_store_.Dispose(); }
439 :
440 : V8_INLINE void AddChar(char code_unit) {
441 : DCHECK(IsValidAscii(code_unit));
442 806140383 : AddOneByteChar(static_cast<byte>(code_unit));
443 : }
444 :
445 : V8_INLINE void AddChar(uc32 code_unit) {
446 273311185 : if (is_one_byte()) {
447 264573432 : if (code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) {
448 : AddOneByteChar(static_cast<byte>(code_unit));
449 : return;
450 : }
451 78150 : ConvertToTwoByte();
452 : }
453 8800740 : AddTwoByteChar(code_unit);
454 : }
455 :
456 : bool is_one_byte() const { return is_one_byte_; }
457 :
458 : bool Equals(Vector<const char> keyword) const {
459 : return is_one_byte() && keyword.length() == position_ &&
460 : (memcmp(keyword.start(), backing_store_.start(), position_) == 0);
461 : }
462 :
463 : Vector<const uint16_t> two_byte_literal() const {
464 : DCHECK(!is_one_byte());
465 : DCHECK_EQ(position_ & 0x1, 0);
466 : return Vector<const uint16_t>(
467 : reinterpret_cast<const uint16_t*>(backing_store_.start()),
468 77419 : position_ >> 1);
469 : }
470 :
471 : Vector<const uint8_t> one_byte_literal() const {
472 : DCHECK(is_one_byte());
473 : return Vector<const uint8_t>(
474 213704467 : reinterpret_cast<const uint8_t*>(backing_store_.start()), position_);
475 : }
476 :
477 45670553 : int length() const { return is_one_byte() ? position_ : (position_ >> 1); }
478 :
479 : void Start() {
480 183217462 : position_ = 0;
481 183217462 : is_one_byte_ = true;
482 : }
483 :
484 : Handle<String> Internalize(Isolate* isolate) const;
485 :
486 : private:
487 : static const int kInitialCapacity = 16;
488 : static const int kGrowthFactor = 4;
489 : static const int kMaxGrowth = 1 * MB;
490 :
491 : inline bool IsValidAscii(char code_unit) {
492 : // Control characters and printable characters span the range of
493 : // valid ASCII characters (0-127). Chars are unsigned on some
494 : // platforms which causes compiler warnings if the validity check
495 : // tests the lower bound >= 0 as it's always true.
496 : return iscntrl(code_unit) || isprint(code_unit);
497 : }
498 :
499 : V8_INLINE void AddOneByteChar(byte one_byte_char) {
500 : DCHECK(is_one_byte());
501 1322699436 : if (position_ >= backing_store_.length()) ExpandBuffer();
502 1322698611 : backing_store_[position_] = one_byte_char;
503 1070383282 : position_ += kOneByteSize;
504 : }
505 :
506 : void AddTwoByteChar(uc32 code_unit);
507 : int NewCapacity(int min_capacity);
508 : void ExpandBuffer();
509 : void ConvertToTwoByte();
510 :
511 : Vector<byte> backing_store_;
512 : int position_;
513 :
514 : bool is_one_byte_;
515 :
516 : DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
517 : };
518 :
519 : // The current and look-ahead token.
520 35899890 : struct TokenDesc {
521 : Location location = {0, 0};
522 : LiteralBuffer literal_chars;
523 : LiteralBuffer raw_literal_chars;
524 : Token::Value token = Token::UNINITIALIZED;
525 : MessageTemplate invalid_template_escape_message = MessageTemplate::kNone;
526 : Location invalid_template_escape_location;
527 : uint32_t smi_value_ = 0;
528 : bool after_line_terminator = false;
529 :
530 : #ifdef DEBUG
531 : bool CanAccessLiteral() const {
532 : return token == Token::PRIVATE_NAME || token == Token::ILLEGAL ||
533 : token == Token::UNINITIALIZED || token == Token::REGEXP_LITERAL ||
534 : IsInRange(token, Token::NUMBER, Token::STRING) ||
535 : Token::IsAnyIdentifier(token) || Token::IsKeyword(token) ||
536 : IsInRange(token, Token::TEMPLATE_SPAN, Token::TEMPLATE_TAIL);
537 : }
538 : bool CanAccessRawLiteral() const {
539 : return token == Token::ILLEGAL || token == Token::UNINITIALIZED ||
540 : IsInRange(token, Token::TEMPLATE_SPAN, Token::TEMPLATE_TAIL);
541 : }
542 : #endif // DEBUG
543 : };
544 :
545 : enum NumberKind {
546 : IMPLICIT_OCTAL,
547 : BINARY,
548 : OCTAL,
549 : HEX,
550 : DECIMAL,
551 : DECIMAL_WITH_LEADING_ZERO
552 : };
553 :
554 : inline bool IsValidBigIntKind(NumberKind kind) {
555 : return IsInRange(kind, BINARY, DECIMAL);
556 : }
557 :
558 : inline bool IsDecimalNumberKind(NumberKind kind) {
559 : return IsInRange(kind, DECIMAL, DECIMAL_WITH_LEADING_ZERO);
560 : }
561 :
562 : static const int kCharacterLookaheadBufferSize = 1;
563 : static const int kMaxAscii = 127;
564 :
565 : // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
566 : template <bool capture_raw>
567 : uc32 ScanOctalEscape(uc32 c, int length);
568 :
569 : // Call this after setting source_ to the input.
570 2991663 : void Init() {
571 : // Set c0_ (one character ahead)
572 : STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
573 : Advance();
574 :
575 2991669 : current_ = &token_storage_[0];
576 2991669 : next_ = &token_storage_[1];
577 2991669 : next_next_ = &token_storage_[2];
578 :
579 2991669 : found_html_comment_ = false;
580 2991669 : scanner_error_ = MessageTemplate::kNone;
581 2991669 : }
582 :
583 : void ReportScannerError(const Location& location, MessageTemplate error) {
584 10208 : if (has_error()) return;
585 10181 : scanner_error_ = error;
586 10181 : scanner_error_location_ = location;
587 : }
588 :
589 : void ReportScannerError(int pos, MessageTemplate error) {
590 234155 : if (has_error()) return;
591 115276 : scanner_error_ = error;
592 115276 : scanner_error_location_ = Location(pos, pos + 1);
593 : }
594 :
595 : // Seek to the next_ token at the given position.
596 : void SeekNext(size_t position);
597 :
598 271243217 : V8_INLINE void AddLiteralChar(uc32 c) { next().literal_chars.AddChar(c); }
599 :
600 806143503 : V8_INLINE void AddLiteralChar(char c) { next().literal_chars.AddChar(c); }
601 :
602 : V8_INLINE void AddRawLiteralChar(uc32 c) {
603 1990615 : next().raw_literal_chars.AddChar(c);
604 : }
605 :
606 : V8_INLINE void AddLiteralCharAdvance() {
607 103382 : AddLiteralChar(c0_);
608 204076 : Advance();
609 : }
610 :
611 : // Low-level scanning support.
612 : template <bool capture_raw = false>
613 239665 : void Advance() {
614 : if (capture_raw) {
615 35589 : AddRawLiteralChar(c0_);
616 : }
617 803665383 : c0_ = source_->Advance();
618 239665 : }
619 :
620 : template <typename FunctionType>
621 : V8_INLINE void AdvanceUntil(FunctionType check) {
622 305080104 : c0_ = source_->AdvanceUntil(check);
623 : }
624 :
625 4893128 : bool CombineSurrogatePair() {  // Merges a lead/trail surrogate pair into c0_; returns true if it did.
626 : DCHECK(!unibrow::Utf16::IsLeadSurrogate(kEndOfInput));
627 9786256 : if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
628 174 : uc32 c1 = source_->Advance();  // Read the candidate trail surrogate.
629 : DCHECK(!unibrow::Utf16::IsTrailSurrogate(kEndOfInput));
630 174 : if (unibrow::Utf16::IsTrailSurrogate(c1)) {
631 294 : c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1);
632 : return true;
633 : }
634 27 : source_->Back();  // c1 is not a trail surrogate: un-read it and leave c0_ unchanged.
635 : }
636 : return false;
637 : }
638 :
639 : void PushBack(uc32 ch) {
640 : DCHECK_LE(c0_, static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode));
641 34 : source_->Back();
642 34 : c0_ = ch;
643 : }
644 :
645 7550196 : uc32 Peek() const { return source_->Peek(); }
646 :
647 : inline Token::Value Select(Token::Value tok) {
648 : Advance();
649 : return tok;
650 : }
651 :
652 4154328 : inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
653 : Advance();
654 4154301 : if (c0_ == next) {
655 : Advance();
656 2932384 : return then;
657 : } else {
658 : return else_;
659 : }
660 : }
661 : // Returns the literal string, if any, for the current token (the
662 : // token last returned by Next()). The string is 0-terminated.
663 : // Literal strings are collected for identifiers, strings, numbers as well
664 : // as for template literals. For template literals we also collect the raw
665 : // form.
666 : // These functions only give the correct result if the literal was scanned
667 : // when a LiteralScope object is alive.
668 : //
669 : // Current usage of these functions is unfortunately a little undisciplined,
670 : // and is_literal_one_byte() + literal_one_byte_string() are also
671 : // requested for tokens that do not have a literal. Hence, we treat any
672 : // token as a one-byte literal. E.g. Token::FUNCTION pretends to have a
673 : // literal "function".
674 : Vector<const uint8_t> literal_one_byte_string() const {
675 : DCHECK(current().CanAccessLiteral() || Token::IsKeyword(current().token));
676 : return current().literal_chars.one_byte_literal();
677 : }
678 : Vector<const uint16_t> literal_two_byte_string() const {
679 : DCHECK(current().CanAccessLiteral() || Token::IsKeyword(current().token));
680 : return current().literal_chars.two_byte_literal();
681 : }
682 : bool is_literal_one_byte() const {
683 : DCHECK(current().CanAccessLiteral() || Token::IsKeyword(current().token));
684 : return current().literal_chars.is_one_byte();
685 : }
686 : // Returns the literal string for the next token (the token that
687 : // would be returned if Next() were called).
688 : Vector<const uint8_t> next_literal_one_byte_string() const {
689 : DCHECK(next().CanAccessLiteral());
690 : return next().literal_chars.one_byte_literal();
691 : }
692 : Vector<const uint16_t> next_literal_two_byte_string() const {
693 : DCHECK(next().CanAccessLiteral());
694 : return next().literal_chars.two_byte_literal();
695 : }
696 : bool is_next_literal_one_byte() const {
697 : DCHECK(next().CanAccessLiteral());
698 : return next().literal_chars.is_one_byte();
699 : }
700 : Vector<const uint8_t> raw_literal_one_byte_string() const {
701 : DCHECK(current().CanAccessRawLiteral());
702 : return current().raw_literal_chars.one_byte_literal();
703 : }
704 : Vector<const uint16_t> raw_literal_two_byte_string() const {
705 : DCHECK(current().CanAccessRawLiteral());
706 : return current().raw_literal_chars.two_byte_literal();
707 : }
708 : bool is_raw_literal_one_byte() const {
709 : DCHECK(current().CanAccessRawLiteral());
710 : return current().raw_literal_chars.is_one_byte();
711 : }
712 :
713 : template <bool capture_raw, bool unicode = false>
714 : uc32 ScanHexNumber(int expected_length);
715 : // Scan a number of any length but not bigger than max_value. For example, the
716 : // number can be 000000001, so it's very long in characters but its value is
717 : // small.
718 : template <bool capture_raw>
719 : uc32 ScanUnlimitedLengthHexNumber(int max_value, int beg_pos);
720 :
721 : // Scans a single JavaScript token.
722 : V8_INLINE Token::Value ScanSingleToken();
723 : V8_INLINE void Scan();
724 : // Performance hack: pass through a pre-calculated "next()" value to avoid
725 : // having to re-calculate it in Scan. You'd think the compiler would be able
726 : // to hoist the next() calculation out of the inlined Scan method, but it seems
727 : // that pointer aliasing analysis fails to show that this is safe.
728 : V8_INLINE void Scan(TokenDesc* next_desc);
729 :
730 : V8_INLINE Token::Value SkipWhiteSpace();
731 : Token::Value SkipSingleHTMLComment();
732 : Token::Value SkipSingleLineComment();
733 : Token::Value SkipSourceURLComment();
734 : void TryToParseSourceURLComment();
735 : Token::Value SkipMultiLineComment();
736 : // Scans a possible HTML comment -- begins with '<!'.
737 : Token::Value ScanHtmlComment();
738 :
739 : bool ScanDigitsWithNumericSeparators(bool (*predicate)(uc32 ch),
740 : bool is_check_first_digit);
741 : bool ScanDecimalDigits();
742 : // Optimized function to scan decimal number as Smi.
743 : bool ScanDecimalAsSmi(uint64_t* value);
744 : bool ScanDecimalAsSmiWithNumericSeparators(uint64_t* value);
745 : bool ScanHexDigits();
746 : bool ScanBinaryDigits();
747 : bool ScanSignedInteger();
748 : bool ScanOctalDigits();
749 : bool ScanImplicitOctalDigits(int start_pos, NumberKind* kind);
750 :
751 : Token::Value ScanNumber(bool seen_period);
752 : V8_INLINE Token::Value ScanIdentifierOrKeyword();
753 : V8_INLINE Token::Value ScanIdentifierOrKeywordInner();
754 : Token::Value ScanIdentifierOrKeywordInnerSlow(bool escaped,
755 : bool can_be_keyword);
756 :
757 : Token::Value ScanString();
758 : Token::Value ScanPrivateName();
759 :
760 : // Scans an escape-sequence which is part of a string and adds the
761 : // decoded character to the current literal. Returns true if a pattern
762 : // is scanned.
763 : template <bool capture_raw>
764 : bool ScanEscape();
765 :
766 : // Decodes a Unicode escape-sequence which is part of an identifier.
767 : // If the escape sequence cannot be decoded the result is kBadChar.
768 : uc32 ScanIdentifierUnicodeEscape();
769 : // Helper for the above functions.
770 : template <bool capture_raw>
771 : uc32 ScanUnicodeEscape();
772 :
773 : Token::Value ScanTemplateSpan();
774 :
775 : // Return the current source position.
776 : int source_pos() {
777 1404752826 : return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize;
778 : }
779 :
780 : static bool LiteralContainsEscapes(const TokenDesc& token) {  // True if the literal's length differs from its length in the source.
781 42204871 : Location location = token.location;
782 42204871 : int source_length = (location.end_pos - location.beg_pos);
783 42204871 : if (token.token == Token::STRING) {
784 : // Subtract delimiters.
785 2890 : source_length -= 2;
786 : }
787 519693 : return token.literal_chars.length() != source_length;
788 : }
789 :
790 : #ifdef DEBUG
791 : void SanityCheckTokenDesc(const TokenDesc&) const;
792 : #endif
793 :
794 : TokenDesc& next() { return *next_; }
795 :
796 : const TokenDesc& current() const { return *current_; }
797 : const TokenDesc& next() const { return *next_; }
798 : const TokenDesc& next_next() const { return *next_next_; }
799 :
800 : TokenDesc* current_; // desc for current token (as returned by Next())
801 : TokenDesc* next_; // desc for next token (one token look-ahead)
802 : TokenDesc* next_next_; // desc for the token after next (after PeekAhead())
803 :
804 : // Input stream. Must be initialized to an Utf16CharacterStream.
805 : Utf16CharacterStream* const source_;
806 :
807 : // One Unicode character look-ahead; c0_ < 0 at the end of the input.
808 : uc32 c0_;
809 :
810 : TokenDesc token_storage_[3];
811 :
812 : // Whether this scanner encountered an HTML comment.
813 : bool found_html_comment_;
814 :
815 : // Harmony flags to allow ESNext features.
816 : bool allow_harmony_private_fields_;
817 : bool allow_harmony_numeric_separator_;
818 :
819 : const bool is_module_;
820 :
821 : // Values parsed from magic comments.
822 : LiteralBuffer source_url_;
823 : LiteralBuffer source_mapping_url_;
824 :
825 : // Last-seen positions of potentially problematic tokens.
826 : Location octal_pos_;
827 : MessageTemplate octal_message_;
828 :
829 : MessageTemplate scanner_error_;
830 : Location scanner_error_location_;
831 : };
832 :
833 : } // namespace internal
834 : } // namespace v8
835 :
836 : #endif // V8_PARSING_SCANNER_H_
|