Line data Source code
1 : // Copyright 2011 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : // Features shared by parsing and pre-parsing scanners.
6 :
7 : #ifndef V8_PARSING_SCANNER_H_
8 : #define V8_PARSING_SCANNER_H_
9 :
10 : #include <algorithm>
11 :
12 : #include "src/allocation.h"
13 : #include "src/base/logging.h"
14 : #include "src/char-predicates.h"
15 : #include "src/globals.h"
16 : #include "src/message-template.h"
17 : #include "src/parsing/token.h"
18 : #include "src/pointer-with-payload.h"
19 : #include "src/unicode-decoder.h"
20 : #include "src/unicode.h"
21 :
22 : namespace v8 {
23 : namespace internal {
24 :
25 : class AstRawString;
26 : class AstValueFactory;
27 : class ExternalOneByteString;
28 : class ExternalTwoByteString;
29 : class ParserRecorder;
30 : class RuntimeCallStats;
31 : class Zone;
32 :
33 : // ---------------------------------------------------------------------
34 : // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
35 : // A code unit is a 16 bit value representing either a 16 bit code point
36 : // or one part of a surrogate pair that make a single 21 bit code point.
37 : class Utf16CharacterStream {
38 : public:
39 : static const uc32 kEndOfInput = -1;
40 :
41 2960213 : virtual ~Utf16CharacterStream() = default;
42 :
43 : V8_INLINE void set_parser_error() {
44 707643 : buffer_cursor_ = buffer_end_;
45 707643 : has_parser_error_ = true;
46 : }
47 48183 : V8_INLINE void reset_parser_error_flag() { has_parser_error_ = false; }
48 : V8_INLINE bool has_parser_error() const { return has_parser_error_; }
49 :
50 820489479 : inline uc32 Peek() {
51 820489479 : if (V8_LIKELY(buffer_cursor_ < buffer_end_)) {
52 813783523 : return static_cast<uc32>(*buffer_cursor_);
53 6706216 : } else if (ReadBlockChecked()) {
54 4431519 : return static_cast<uc32>(*buffer_cursor_);
55 : } else {
56 : return kEndOfInput;
57 : }
58 : }
59 :
60 : // Returns and advances past the next UTF-16 code unit in the input
61 : // stream. If there are no more code units it returns kEndOfInput.
62 : inline uc32 Advance() {
63 813917704 : uc32 result = Peek();
64 813928232 : buffer_cursor_++;
65 : return result;
66 : }
67 :
68 : // Returns and advances past the next UTF-16 code unit in the input stream
69 : // that meets the checks requirement. If there are no more code units it
70 : // returns kEndOfInput.
71 : template <typename FunctionType>
72 : V8_INLINE uc32 AdvanceUntil(FunctionType check) {
73 : while (true) {
74 : auto next_cursor_pos =
75 : std::find_if(buffer_cursor_, buffer_end_, [&check](uint16_t raw_c0_) {
76 938916665 : uc32 c0_ = static_cast<uc32>(raw_c0_);
77 938916665 : return check(c0_);
78 150042590 : });
79 :
80 150093338 : if (next_cursor_pos == buffer_end_) {
81 2276691 : buffer_cursor_ = buffer_end_;
82 2276691 : if (!ReadBlockChecked()) {
83 114706 : buffer_cursor_++;
84 : return kEndOfInput;
85 : }
86 : } else {
87 147816647 : buffer_cursor_ = next_cursor_pos + 1;
88 147816647 : return static_cast<uc32>(*next_cursor_pos);
89 : }
90 : }
91 : }
92 :
93 : // Go back one by one character in the input stream.
94 : // This undoes the most recent Advance().
95 5413339 : inline void Back() {
96 : // The common case - if the previous character is within
97 : // buffer_start_ .. buffer_end_ will be handles locally.
98 : // Otherwise, a new block is requested.
99 5413339 : if (V8_LIKELY(buffer_cursor_ > buffer_start_)) {
100 5209414 : buffer_cursor_--;
101 : } else {
102 203925 : ReadBlockAt(pos() - 1);
103 : }
104 5413339 : }
105 :
106 : inline size_t pos() const {
107 1378305715 : return buffer_pos_ + (buffer_cursor_ - buffer_start_);
108 : }
109 :
110 114197 : inline void Seek(size_t pos) {
111 114197 : if (V8_LIKELY(pos >= buffer_pos_ &&
112 : pos < (buffer_pos_ + (buffer_end_ - buffer_start_)))) {
113 96487 : buffer_cursor_ = buffer_start_ + (pos - buffer_pos_);
114 : } else {
115 : ReadBlockAt(pos);
116 : }
117 114197 : }
118 :
119 : // Returns true if the stream could access the V8 heap after construction.
120 66 : bool can_be_cloned_for_parallel_access() const {
121 66 : return can_be_cloned() && !can_access_heap();
122 : }
123 :
124 : // Returns true if the stream can be cloned with Clone.
125 : // TODO(rmcilroy): Remove this once ChunkedStreams can be cloned.
126 : virtual bool can_be_cloned() const = 0;
127 :
128 : // Clones the character stream to enable another independent scanner to access
129 : // the same underlying stream.
130 : virtual std::unique_ptr<Utf16CharacterStream> Clone() const = 0;
131 :
132 : // Returns true if the stream could access the V8 heap after construction.
133 : virtual bool can_access_heap() const = 0;
134 :
135 : RuntimeCallStats* runtime_call_stats() const { return runtime_call_stats_; }
136 : void set_runtime_call_stats(RuntimeCallStats* runtime_call_stats) {
137 13179 : runtime_call_stats_ = runtime_call_stats;
138 : }
139 :
140 : protected:
141 : Utf16CharacterStream(const uint16_t* buffer_start,
142 : const uint16_t* buffer_cursor,
143 : const uint16_t* buffer_end, size_t buffer_pos)
144 : : buffer_start_(buffer_start),
145 : buffer_cursor_(buffer_cursor),
146 : buffer_end_(buffer_end),
147 2960224 : buffer_pos_(buffer_pos) {}
148 : Utf16CharacterStream() : Utf16CharacterStream(nullptr, nullptr, nullptr, 0) {}
149 :
150 9204258 : bool ReadBlockChecked() {
151 : size_t position = pos();
152 : USE(position);
153 9204258 : bool success = !has_parser_error() && ReadBlock();
154 :
155 : // Post-conditions: 1, We should always be at the right position.
156 : // 2, Cursor should be inside the buffer.
157 : // 3, We should have more characters available iff success.
158 : DCHECK_EQ(pos(), position);
159 : DCHECK_LE(buffer_cursor_, buffer_end_);
160 : DCHECK_LE(buffer_start_, buffer_cursor_);
161 : DCHECK_EQ(success, buffer_cursor_ < buffer_end_);
162 2276696 : return success;
163 : }
164 :
165 : void ReadBlockAt(size_t new_pos) {
166 : // The callers of this method (Back/Back2/Seek) should handle the easy
167 : // case (seeking within the current buffer), and we should only get here
168 : // if we actually require new data.
169 : // (This is really an efficiency check, not a correctness invariant.)
170 : DCHECK(new_pos < buffer_pos_ ||
171 : new_pos >= buffer_pos_ + (buffer_end_ - buffer_start_));
172 :
173 : // Change pos() to point to new_pos.
174 221635 : buffer_pos_ = new_pos;
175 221635 : buffer_cursor_ = buffer_start_;
176 : DCHECK_EQ(pos(), new_pos);
177 : ReadBlockChecked();
178 : }
179 :
180 : // Read more data, and update buffer_*_ to point to it.
181 : // Returns true if more data was available.
182 : //
183 : // ReadBlock() may modify any of the buffer_*_ members, but must sure that
184 : // the result of pos() remains unaffected.
185 : //
186 : // Examples:
187 : // - a stream could either fill a separate buffer. Then buffer_start_ and
188 : // buffer_cursor_ would point to the beginning of the buffer, and
189 : // buffer_pos would be the old pos().
190 : // - a stream with existing buffer chunks would set buffer_start_ and
191 : // buffer_end_ to cover the full chunk, and then buffer_cursor_ would
192 : // point into the middle of the buffer, while buffer_pos_ would describe
193 : // the start of the buffer.
194 : virtual bool ReadBlock() = 0;
195 :
196 : const uint16_t* buffer_start_;
197 : const uint16_t* buffer_cursor_;
198 : const uint16_t* buffer_end_;
199 : size_t buffer_pos_;
200 : RuntimeCallStats* runtime_call_stats_;
201 : bool has_parser_error_ = false;
202 : };
203 :
204 : // ----------------------------------------------------------------------------
205 : // JavaScript Scanner.
206 :
207 5919019 : class Scanner {
208 : public:
209 : // Scoped helper for a re-settable bookmark.
210 : class BookmarkScope {
211 : public:
212 2510283 : explicit BookmarkScope(Scanner* scanner)
213 : : scanner_(scanner),
214 : bookmark_(kNoBookmark),
215 5020566 : had_parser_error_(scanner->has_parser_error()) {
216 : DCHECK_NOT_NULL(scanner_);
217 : }
218 : ~BookmarkScope() = default;
219 :
220 : void Set(size_t bookmark);
221 : void Apply();
222 : bool HasBeenSet() const;
223 : bool HasBeenApplied() const;
224 :
225 : private:
226 : static const size_t kNoBookmark;
227 : static const size_t kBookmarkWasApplied;
228 :
229 : Scanner* scanner_;
230 : size_t bookmark_;
231 : bool had_parser_error_;
232 :
233 : DISALLOW_COPY_AND_ASSIGN(BookmarkScope);
234 : };
235 :
236 : // Sets the Scanner into an error state to stop further scanning and terminate
237 : // the parsing by only returning ILLEGAL tokens after that.
238 3998416 : V8_INLINE void set_parser_error() {
239 3998416 : if (!has_parser_error()) {
240 707643 : c0_ = kEndOfInput;
241 : source_->set_parser_error();
242 707643 : for (TokenDesc& desc : token_storage_) desc.token = Token::ILLEGAL;
243 : }
244 : }
245 : V8_INLINE void reset_parser_error_flag() {
246 : source_->reset_parser_error_flag();
247 : }
248 : V8_INLINE bool has_parser_error() const {
249 39832046 : return source_->has_parser_error();
250 : }
251 :
252 : // Representation of an interval of source positions.
253 : struct Location {
254 52011489 : Location(int b, int e) : beg_pos(b), end_pos(e) { }
255 234999313 : Location() : beg_pos(0), end_pos(0) { }
256 :
257 2616208 : int length() const { return end_pos - beg_pos; }
258 510723008 : bool IsValid() const { return IsInRange(beg_pos, 0, end_pos); }
259 :
260 : static Location invalid() { return Location(-1, 0); }
261 :
262 : int beg_pos;
263 : int end_pos;
264 : };
265 :
266 : // -1 is outside of the range of any real source code.
267 : static const int kNoOctalLocation = -1;
268 : static const uc32 kEndOfInput = Utf16CharacterStream::kEndOfInput;
269 :
270 : explicit Scanner(Utf16CharacterStream* source, bool is_module);
271 :
272 : void Initialize();
273 :
274 : // Returns the next token and advances input.
275 : Token::Value Next();
276 : // Returns the token following peek()
277 : Token::Value PeekAhead();
278 : // Returns the current token again.
279 68170220 : Token::Value current_token() const { return current().token; }
280 :
281 : // Returns the location information for the current token
282 : // (the token last returned by Next()).
283 214700760 : const Location& location() const { return current().location; }
284 :
285 : // This error is specifically an invalid hex or unicode escape sequence.
286 : bool has_error() const { return scanner_error_ != MessageTemplate::kNone; }
287 : MessageTemplate error() const { return scanner_error_; }
288 : const Location& error_location() const { return scanner_error_location_; }
289 :
290 150456 : bool has_invalid_template_escape() const {
291 150456 : return current().invalid_template_escape_message != MessageTemplate::kNone;
292 : }
293 : MessageTemplate invalid_template_escape_message() const {
294 : DCHECK(has_invalid_template_escape());
295 : return current().invalid_template_escape_message;
296 : }
297 :
298 : void clear_invalid_template_escape_message() {
299 : DCHECK(has_invalid_template_escape());
300 12980 : current_->invalid_template_escape_message = MessageTemplate::kNone;
301 : }
302 :
303 : Location invalid_template_escape_location() const {
304 : DCHECK(has_invalid_template_escape());
305 6344 : return current().invalid_template_escape_location;
306 : }
307 :
308 : // Similar functions for the upcoming token.
309 :
310 : // One token look-ahead (past the token returned by Next()).
311 2201953547 : Token::Value peek() const { return next().token; }
312 :
313 626699903 : const Location& peek_location() const { return next().location; }
314 :
315 51040253 : bool literal_contains_escapes() const {
316 51040253 : return LiteralContainsEscapes(current());
317 : }
318 :
319 504055 : bool next_literal_contains_escapes() const {
320 504055 : return LiteralContainsEscapes(next());
321 : }
322 :
323 : const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory) const;
324 :
325 : const AstRawString* NextSymbol(AstValueFactory* ast_value_factory) const;
326 : const AstRawString* CurrentRawSymbol(
327 : AstValueFactory* ast_value_factory) const;
328 :
329 : double DoubleValue();
330 :
331 : const char* CurrentLiteralAsCString(Zone* zone) const;
332 :
333 : inline bool CurrentMatches(Token::Value token) const {
334 : DCHECK(Token::IsKeyword(token));
335 : return current().token == token;
336 : }
337 :
338 : template <size_t N>
339 2619607 : bool NextLiteralEquals(const char (&s)[N]) {
340 : DCHECK_EQ(Token::STRING, peek());
341 : // The length of the token is used to make sure the literal equals without
342 : // taking escape sequences (e.g., "use \x73trict") or line continuations
343 : // (e.g., "use \(newline) strict") into account.
344 2619607 : if (!is_next_literal_one_byte()) return false;
345 2616208 : if (peek_location().length() != N + 1) return false;
346 :
347 : Vector<const uint8_t> next = next_literal_one_byte_string();
348 : const char* chars = reinterpret_cast<const char*>(next.start());
349 429217 : return next.length() == N - 1 && strncmp(s, chars, N - 1) == 0;
350 : }
351 :
352 : // Returns the location of the last seen octal literal.
353 : Location octal_position() const { return octal_pos_; }
354 : void clear_octal_position() {
355 2865 : octal_pos_ = Location::invalid();
356 2865 : octal_message_ = MessageTemplate::kNone;
357 : }
358 : MessageTemplate octal_message() const { return octal_message_; }
359 :
360 : // Returns the value of the last smi that was scanned.
361 21031575 : uint32_t smi_value() const { return current().smi_value_; }
362 :
363 : // Seek forward to the given position. This operation does not
364 : // work in general, for instance when there are pushed back
365 : // characters, but works for seeking forward until simple delimiter
366 : // tokens, which is what it is used for.
367 : void SeekForward(int pos);
368 :
369 : // Returns true if there was a line terminator before the peek'ed token,
370 : // possibly inside a multi-line comment.
371 123355642 : bool HasLineTerminatorBeforeNext() const {
372 123355642 : return next().after_line_terminator;
373 : }
374 :
375 161731 : bool HasLineTerminatorAfterNext() {
376 161731 : Token::Value ensure_next_next = PeekAhead();
377 : USE(ensure_next_next);
378 161731 : return next_next().after_line_terminator;
379 : }
380 :
381 : // Scans the input as a regular expression pattern, next token must be /(=).
382 : // Returns true if a pattern is scanned.
383 : bool ScanRegExpPattern();
384 : // Scans the input as regular expression flags. Returns the flags on success.
385 : Maybe<RegExp::Flags> ScanRegExpFlags();
386 :
387 : // Scans the input as a template literal
388 : Token::Value ScanTemplateContinuation() {
389 : DCHECK_EQ(next().token, Token::RBRACE);
390 : DCHECK_EQ(source_pos() - 1, next().location.beg_pos);
391 78315 : return ScanTemplateSpan();
392 : }
393 :
394 : Handle<String> SourceUrl(Isolate* isolate) const;
395 : Handle<String> SourceMappingUrl(Isolate* isolate) const;
396 :
397 : bool FoundHtmlComment() const { return found_html_comment_; }
398 :
399 : bool allow_harmony_private_fields() const {
400 : return allow_harmony_private_fields_;
401 : }
402 : void set_allow_harmony_private_fields(bool allow) {
403 3353304 : allow_harmony_private_fields_ = allow;
404 : }
405 : bool allow_harmony_numeric_separator() const {
406 : return allow_harmony_numeric_separator_;
407 : }
408 : void set_allow_harmony_numeric_separator(bool allow) {
409 2957739 : allow_harmony_numeric_separator_ = allow;
410 : }
411 :
412 : const Utf16CharacterStream* stream() const { return source_; }
413 :
414 : // If the next characters in the stream are "#!", the line is skipped.
415 : void SkipHashBang();
416 :
417 : private:
418 : // Scoped helper for saving & restoring scanner error state.
419 : // This is used for tagged template literals, in which normally forbidden
420 : // escape sequences are allowed.
421 : class ErrorState;
422 :
423 : // LiteralBuffer - Collector of chars of literals.
424 : class LiteralBuffer {
425 : public:
426 23679624 : LiteralBuffer() : backing_store_(), position_(0), is_one_byte_(true) {}
427 :
428 : ~LiteralBuffer() { backing_store_.Dispose(); }
429 :
430 : V8_INLINE void AddChar(char code_unit) {
431 : DCHECK(IsValidAscii(code_unit));
432 787139304 : AddOneByteChar(static_cast<byte>(code_unit));
433 : }
434 :
435 252549002 : V8_INLINE void AddChar(uc32 code_unit) {
436 252549002 : if (is_one_byte()) {
437 251299358 : if (code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) {
438 251222336 : AddOneByteChar(static_cast<byte>(code_unit));
439 : return;
440 : }
441 77022 : ConvertToTwoByte();
442 : }
443 1312880 : AddTwoByteChar(code_unit);
444 : }
445 :
446 252489858 : bool is_one_byte() const { return is_one_byte_; }
447 :
448 : bool Equals(Vector<const char> keyword) const {
449 : return is_one_byte() && keyword.length() == position_ &&
450 : (memcmp(keyword.start(), backing_store_.start(), position_) == 0);
451 : }
452 :
453 : Vector<const uint16_t> two_byte_literal() const {
454 : DCHECK(!is_one_byte());
455 : DCHECK_EQ(position_ & 0x1, 0);
456 : return Vector<const uint16_t>(
457 76307 : reinterpret_cast<const uint16_t*>(backing_store_.start()),
458 76307 : position_ >> 1);
459 : }
460 :
461 77050357 : Vector<const uint8_t> one_byte_literal() const {
462 : DCHECK(is_one_byte());
463 : return Vector<const uint8_t>(
464 211449656 : reinterpret_cast<const uint8_t*>(backing_store_.start()), position_);
465 : }
466 :
467 54980929 : int length() const { return is_one_byte() ? position_ : (position_ >> 1); }
468 :
469 131400530 : void Start() {
470 175316092 : position_ = 0;
471 175316092 : is_one_byte_ = true;
472 131400530 : }
473 :
474 : Handle<String> Internalize(Isolate* isolate) const;
475 :
476 : private:
477 : static const int kInitialCapacity = 16;
478 : static const int kGrowthFactor = 4;
479 : static const int kMaxGrowth = 1 * MB;
480 :
481 : inline bool IsValidAscii(char code_unit) {
482 : // Control characters and printable characters span the range of
483 : // valid ASCII characters (0-127). Chars are unsigned on some
484 : // platforms which causes compiler warnings if the validity check
485 : // tests the lower bound >= 0 as it's always true.
486 : return iscntrl(code_unit) || isprint(code_unit);
487 : }
488 :
489 : V8_INLINE void AddOneByteChar(byte one_byte_char) {
490 : DCHECK(is_one_byte());
491 1038379936 : if (position_ >= backing_store_.length()) ExpandBuffer();
492 1038377326 : backing_store_[position_] = one_byte_char;
493 1038377634 : position_ += kOneByteSize;
494 : }
495 :
496 : void AddTwoByteChar(uc32 code_unit);
497 : int NewCapacity(int min_capacity);
498 : void ExpandBuffer();
499 : void ConvertToTwoByte();
500 :
501 : Vector<byte> backing_store_;
502 : int position_;
503 :
504 : bool is_one_byte_;
505 :
506 : DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
507 : };
508 :
509 : // The current and look-ahead token.
510 35513789 : struct TokenDesc {
511 : Location location = {0, 0};
512 : LiteralBuffer literal_chars;
513 : LiteralBuffer raw_literal_chars;
514 : Token::Value token = Token::UNINITIALIZED;
515 : MessageTemplate invalid_template_escape_message = MessageTemplate::kNone;
516 : Location invalid_template_escape_location;
517 : uint32_t smi_value_ = 0;
518 : bool after_line_terminator = false;
519 :
520 : #ifdef DEBUG
521 : bool CanAccessLiteral() const {
522 : return token == Token::PRIVATE_NAME || token == Token::ILLEGAL ||
523 : token == Token::UNINITIALIZED || token == Token::REGEXP_LITERAL ||
524 : IsInRange(token, Token::NUMBER, Token::STRING) ||
525 : (Token::IsAnyIdentifier(token) && !Token::IsKeyword(token)) ||
526 : IsInRange(token, Token::TEMPLATE_SPAN, Token::TEMPLATE_TAIL);
527 : }
528 : bool CanAccessRawLiteral() const {
529 : return token == Token::ILLEGAL || token == Token::UNINITIALIZED ||
530 : IsInRange(token, Token::TEMPLATE_SPAN, Token::TEMPLATE_TAIL);
531 : }
532 : #endif // DEBUG
533 : };
534 :
535 : enum NumberKind {
536 : BINARY,
537 : OCTAL,
538 : IMPLICIT_OCTAL,
539 : HEX,
540 : DECIMAL,
541 : DECIMAL_WITH_LEADING_ZERO
542 : };
543 :
544 : static const int kCharacterLookaheadBufferSize = 1;
545 : static const int kMaxAscii = 127;
546 :
547 : // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
548 : template <bool capture_raw>
549 : uc32 ScanOctalEscape(uc32 c, int length);
550 :
551 : // Call this after setting source_ to the input.
552 2959368 : void Init() {
553 : // Set c0_ (one character ahead)
554 : STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
555 : Advance();
556 :
557 2959509 : current_ = &token_storage_[0];
558 2959509 : next_ = &token_storage_[1];
559 2959509 : next_next_ = &token_storage_[2];
560 :
561 2959509 : found_html_comment_ = false;
562 2959509 : scanner_error_ = MessageTemplate::kNone;
563 2959509 : }
564 :
565 10030 : void ReportScannerError(const Location& location, MessageTemplate error) {
566 10030 : if (has_error()) return;
567 10181 : scanner_error_ = error;
568 10181 : scanner_error_location_ = location;
569 : }
570 :
571 225139 : void ReportScannerError(int pos, MessageTemplate error) {
572 225139 : if (has_error()) return;
573 111386 : scanner_error_ = error;
574 111386 : scanner_error_location_ = Location(pos, pos + 1);
575 : }
576 :
577 : // Seek to the next_ token at the given position.
578 : void SeekNext(size_t position);
579 :
580 250510337 : V8_INLINE void AddLiteralChar(uc32 c) { next().literal_chars.AddChar(c); }
581 :
582 787187033 : V8_INLINE void AddLiteralChar(char c) { next().literal_chars.AddChar(c); }
583 :
584 : V8_INLINE void AddRawLiteralChar(uc32 c) {
585 1923414 : next().raw_literal_chars.AddChar(c);
586 : }
587 :
588 : V8_INLINE void AddLiteralCharAdvance() {
589 101554 : AddLiteralChar(c0_);
590 17641998 : Advance();
591 : }
592 :
593 : // Low-level scanning support.
594 : template <bool capture_raw = false>
595 429509032 : void Advance() {
596 : if (capture_raw) {
597 35598 : AddRawLiteralChar(c0_);
598 : }
599 1216526920 : c0_ = source_->Advance();
600 429520779 : }
601 :
602 : template <typename FunctionType>
603 : V8_INLINE void AdvanceUntil(FunctionType check) {
604 147931273 : c0_ = source_->AdvanceUntil(check);
605 : }
606 :
607 4835738 : bool CombineSurrogatePair() {
608 : DCHECK(!unibrow::Utf16::IsLeadSurrogate(kEndOfInput));
609 9671476 : if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
610 174 : uc32 c1 = source_->Advance();
611 : DCHECK(!unibrow::Utf16::IsTrailSurrogate(kEndOfInput));
612 174 : if (unibrow::Utf16::IsTrailSurrogate(c1)) {
613 294 : c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1);
614 : return true;
615 : }
616 27 : source_->Back();
617 : }
618 : return false;
619 : }
620 :
621 : void PushBack(uc32 ch) {
622 : DCHECK_LE(c0_, static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode));
623 33 : source_->Back();
624 33 : c0_ = ch;
625 : }
626 :
627 7402179 : uc32 Peek() const { return source_->Peek(); }
628 :
629 160867975 : inline Token::Value Select(Token::Value tok) {
630 : Advance();
631 160869178 : return tok;
632 : }
633 :
634 6918018 : inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
635 : Advance();
636 4047558 : if (c0_ == next) {
637 : Advance();
638 2870428 : return then;
639 : } else {
640 : return else_;
641 : }
642 : }
643 : // Returns the literal string, if any, for the current token (the
644 : // token last returned by Next()). The string is 0-terminated.
645 : // Literal strings are collected for identifiers, strings, numbers as well
646 : // as for template literals. For template literals we also collect the raw
647 : // form.
648 : // These functions only give the correct result if the literal was scanned
649 : // when a LiteralScope object is alive.
650 : //
651 : // Current usage of these functions is unfortunately a little undisciplined,
652 : // and is_literal_one_byte() + is_literal_one_byte_string() is also
653 : // requested for tokens that do not have a literal. Hence, we treat any
654 : // token as a one-byte literal. E.g. Token::FUNCTION pretends to have a
655 : // literal "function".
656 1263018 : Vector<const uint8_t> literal_one_byte_string() const {
657 : DCHECK(current().CanAccessLiteral() || Token::IsKeyword(current().token));
658 : return current().literal_chars.one_byte_literal();
659 : }
660 : Vector<const uint16_t> literal_two_byte_string() const {
661 : DCHECK(current().CanAccessLiteral() || Token::IsKeyword(current().token));
662 : return current().literal_chars.two_byte_literal();
663 : }
664 99715910 : bool is_literal_one_byte() const {
665 : DCHECK(current().CanAccessLiteral() || Token::IsKeyword(current().token));
666 99715910 : return current().literal_chars.is_one_byte();
667 : }
668 : // Returns the literal string for the next token (the token that
669 : // would be returned if Next() were called).
670 : Vector<const uint8_t> next_literal_one_byte_string() const {
671 : DCHECK(next().CanAccessLiteral());
672 : return next().literal_chars.one_byte_literal();
673 : }
674 : Vector<const uint16_t> next_literal_two_byte_string() const {
675 : DCHECK(next().CanAccessLiteral());
676 : return next().literal_chars.two_byte_literal();
677 : }
678 3172771 : bool is_next_literal_one_byte() const {
679 : DCHECK(next().CanAccessLiteral());
680 3172771 : return next().literal_chars.is_one_byte();
681 : }
682 : Vector<const uint8_t> raw_literal_one_byte_string() const {
683 : DCHECK(current().CanAccessRawLiteral());
684 : return current().raw_literal_chars.one_byte_literal();
685 : }
686 : Vector<const uint16_t> raw_literal_two_byte_string() const {
687 : DCHECK(current().CanAccessRawLiteral());
688 : return current().raw_literal_chars.two_byte_literal();
689 : }
690 82293 : bool is_raw_literal_one_byte() const {
691 : DCHECK(current().CanAccessRawLiteral());
692 82293 : return current().raw_literal_chars.is_one_byte();
693 : }
694 :
695 : template <bool capture_raw, bool unicode = false>
696 : uc32 ScanHexNumber(int expected_length);
697 : // Scan a number of any length but not bigger than max_value. For example, the
698 : // number can be 000000001, so it's very long in characters but its value is
699 : // small.
700 : template <bool capture_raw>
701 : uc32 ScanUnlimitedLengthHexNumber(int max_value, int beg_pos);
702 :
703 : // Scans a single JavaScript token.
704 : V8_INLINE Token::Value ScanSingleToken();
705 : V8_INLINE void Scan();
706 : // Performance hack: pass through a pre-calculated "next()" value to avoid
707 : // having to re-calculate it in Scan. You'd think the compiler would be able
708 : // to hoist the next() calculation out of the inlined Scan method, but seems
709 : // that pointer aliasing analysis fails show that this is safe.
710 : V8_INLINE void Scan(TokenDesc* next_desc);
711 :
712 : V8_INLINE Token::Value SkipWhiteSpace();
713 : Token::Value SkipSingleHTMLComment();
714 : Token::Value SkipSingleLineComment();
715 : Token::Value SkipSourceURLComment();
716 : void TryToParseSourceURLComment();
717 : Token::Value SkipMultiLineComment();
718 : // Scans a possible HTML comment -- begins with '<!'.
719 : Token::Value ScanHtmlComment();
720 :
721 : bool ScanDigitsWithNumericSeparators(bool (*predicate)(uc32 ch),
722 : bool is_check_first_digit);
723 : bool ScanDecimalDigits();
724 : // Optimized function to scan decimal number as Smi.
725 : bool ScanDecimalAsSmi(uint64_t* value);
726 : bool ScanDecimalAsSmiWithNumericSeparators(uint64_t* value);
727 : bool ScanHexDigits();
728 : bool ScanBinaryDigits();
729 : bool ScanSignedInteger();
730 : bool ScanOctalDigits();
731 : bool ScanImplicitOctalDigits(int start_pos, NumberKind* kind);
732 :
733 : Token::Value ScanNumber(bool seen_period);
734 : V8_INLINE Token::Value ScanIdentifierOrKeyword();
735 : V8_INLINE Token::Value ScanIdentifierOrKeywordInner();
736 : Token::Value ScanIdentifierOrKeywordInnerSlow(bool escaped,
737 : bool can_be_keyword);
738 :
739 : Token::Value ScanString();
740 : Token::Value ScanPrivateName();
741 :
742 : // Scans an escape-sequence which is part of a string and adds the
743 : // decoded character to the current literal. Returns true if a pattern
744 : // is scanned.
745 : template <bool capture_raw>
746 : bool ScanEscape();
747 :
748 : // Decodes a Unicode escape-sequence which is part of an identifier.
749 : // If the escape sequence cannot be decoded the result is kBadChar.
750 : uc32 ScanIdentifierUnicodeEscape();
751 : // Helper for the above functions.
752 : template <bool capture_raw>
753 77913 : uc32 ScanUnicodeEscape();
754 :
755 : Token::Value ScanTemplateSpan();
756 :
757 : // Return the current source position.
758 1294567304 : int source_pos() {
759 2648555308 : return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize;
760 : }
761 :
762 : static bool LiteralContainsEscapes(const TokenDesc& token) {
763 51544308 : Location location = token.location;
764 51544308 : int source_length = (location.end_pos - location.beg_pos);
765 51544308 : if (token.token == Token::STRING) {
766 : // Subtract delimiters.
767 38672 : source_length -= 2;
768 : }
769 51544308 : return token.literal_chars.length() != source_length;
770 : }
771 :
772 : #ifdef DEBUG
773 : void SanityCheckTokenDesc(const TokenDesc&) const;
774 : #endif
775 :
776 2197810620 : TokenDesc& next() { return *next_; }
777 :
778 : const TokenDesc& current() const { return *current_; }
779 : const TokenDesc& next() const { return *next_; }
780 : const TokenDesc& next_next() const { return *next_next_; }
781 :
782 : TokenDesc* current_; // desc for current token (as returned by Next())
783 : TokenDesc* next_; // desc for next token (one token look-ahead)
784 : TokenDesc* next_next_; // desc for the token after next (after PeakAhead())
785 :
786 : // Input stream. Must be initialized to an Utf16CharacterStream.
787 : Utf16CharacterStream* const source_;
788 :
789 : // One Unicode character look-ahead; c0_ < 0 at the end of the input.
790 : uc32 c0_;
791 :
792 : TokenDesc token_storage_[3];
793 :
794 : // Whether this scanner encountered an HTML comment.
795 : bool found_html_comment_;
796 :
797 : // Harmony flags to allow ESNext features.
798 : bool allow_harmony_private_fields_;
799 : bool allow_harmony_numeric_separator_;
800 :
801 : const bool is_module_;
802 :
803 : // Values parsed from magic comments.
804 : LiteralBuffer source_url_;
805 : LiteralBuffer source_mapping_url_;
806 :
807 : // Last-seen positions of potentially problematic tokens.
808 : Location octal_pos_;
809 : MessageTemplate octal_message_;
810 :
811 : MessageTemplate scanner_error_;
812 : Location scanner_error_location_;
813 : };
814 :
815 : } // namespace internal
816 : } // namespace v8
817 :
818 : #endif // V8_PARSING_SCANNER_H_
|