Line data Source code
1 : // Copyright 2011 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : // Features shared by parsing and pre-parsing scanners.
6 :
7 : #include "src/parsing/scanner.h"
8 :
9 : #include <stdint.h>
10 :
11 : #include <cmath>
12 :
13 : #include "src/ast/ast-value-factory.h"
14 : #include "src/conversions-inl.h"
15 : #include "src/objects/bigint.h"
16 : #include "src/parsing/scanner-inl.h"
17 : #include "src/zone/zone.h"
18 :
19 : namespace v8 {
20 : namespace internal {
21 :
22 : class Scanner::ErrorState {
23 : public:
24 : ErrorState(MessageTemplate* message_stack, Scanner::Location* location_stack)
25 : : message_stack_(message_stack),
26 : old_message_(*message_stack),
27 : location_stack_(location_stack),
28 314214 : old_location_(*location_stack) {
29 314214 : *message_stack_ = MessageTemplate::kNone;
30 314214 : *location_stack_ = Location::invalid();
31 : }
32 :
33 : ~ErrorState() {
34 314226 : *message_stack_ = old_message_;
35 314226 : *location_stack_ = old_location_;
36 : }
37 :
38 : void MoveErrorTo(TokenDesc* dest) {
39 30292 : if (*message_stack_ == MessageTemplate::kNone) {
40 : return;
41 : }
42 13109 : if (dest->invalid_template_escape_message == MessageTemplate::kNone) {
43 13109 : dest->invalid_template_escape_message = *message_stack_;
44 13109 : dest->invalid_template_escape_location = *location_stack_;
45 : }
46 13109 : *message_stack_ = MessageTemplate::kNone;
47 13109 : *location_stack_ = Location::invalid();
48 : }
49 :
50 : private:
51 : MessageTemplate* const message_stack_;
52 : MessageTemplate const old_message_;
53 : Scanner::Location* const location_stack_;
54 : Scanner::Location const old_location_;
55 : };
56 :
57 : // ----------------------------------------------------------------------------
58 : // Scanner::LiteralBuffer
59 :
60 3972 : Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const {
61 3972 : if (is_one_byte()) {
62 3972 : return isolate->factory()->InternalizeOneByteString(one_byte_literal());
63 : }
64 0 : return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
65 : }
66 :
67 0 : int Scanner::LiteralBuffer::NewCapacity(int min_capacity) {
68 : return min_capacity < (kMaxGrowth / (kGrowthFactor - 1))
69 : ? min_capacity * kGrowthFactor
70 5740317 : : min_capacity + kMaxGrowth;
71 : }
72 :
73 5738001 : void Scanner::LiteralBuffer::ExpandBuffer() {
74 11476162 : int min_capacity = Max(kInitialCapacity, backing_store_.length());
75 : Vector<byte> new_store = Vector<byte>::New(NewCapacity(min_capacity));
76 5738161 : MemCopy(new_store.start(), backing_store_.start(), position_);
77 : backing_store_.Dispose();
78 5738162 : backing_store_ = new_store;
79 5738162 : }
80 :
81 78175 : void Scanner::LiteralBuffer::ConvertToTwoByte() {
82 : DCHECK(is_one_byte());
83 : Vector<byte> new_store;
84 78175 : int new_content_size = position_ * kUC16Size;
85 312700 : if (new_content_size >= backing_store_.length()) {
86 : // Ensure room for all currently read code units as UC16 as well
87 : // as the code unit about to be stored.
88 : new_store = Vector<byte>::New(NewCapacity(new_content_size));
89 : } else {
90 75859 : new_store = backing_store_;
91 : }
92 : uint8_t* src = backing_store_.start();
93 : uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
94 195099 : for (int i = position_ - 1; i >= 0; i--) {
95 116924 : dst[i] = src[i];
96 : }
97 78175 : if (new_store.start() != backing_store_.start()) {
98 : backing_store_.Dispose();
99 2316 : backing_store_ = new_store;
100 : }
101 78175 : position_ = new_content_size;
102 78175 : is_one_byte_ = false;
103 78175 : }
104 :
105 1330823 : void Scanner::LiteralBuffer::AddTwoByteChar(uc32 code_unit) {
106 : DCHECK(!is_one_byte());
107 4047665 : if (position_ >= backing_store_.length()) ExpandBuffer();
108 1332427 : if (code_unit <=
109 : static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
110 2611264 : *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
111 1305632 : position_ += kUC16Size;
112 : } else {
113 26795 : *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
114 53590 : unibrow::Utf16::LeadSurrogate(code_unit);
115 26795 : position_ += kUC16Size;
116 26795 : if (position_ >= backing_store_.length()) ExpandBuffer();
117 26797 : *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
118 26797 : unibrow::Utf16::TrailSurrogate(code_unit);
119 26797 : position_ += kUC16Size;
120 : }
121 1332429 : }
122 :
123 : // ----------------------------------------------------------------------------
124 : // Scanner::BookmarkScope
125 :
// Sentinel values stored in BookmarkScope::bookmark_. They occupy the two
// largest size_t values; HasBeenSet() treats any other value as a real
// bookmark position.
126 : const size_t Scanner::BookmarkScope::kNoBookmark =
127 : std::numeric_limits<size_t>::max() - 1;
128 : const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
129 : std::numeric_limits<size_t>::max();
130 :
// Records |position| as the bookmark. Debug builds check that no bookmark has
// been recorded (or applied) on this scope before.
131 2446248 : void Scanner::BookmarkScope::Set(size_t position) {
132 : DCHECK_EQ(bookmark_, kNoBookmark);
133 2446248 : bookmark_ = position;
134 2446248 : }
135 :
136 43007 : void Scanner::BookmarkScope::Apply() {
137 : DCHECK(HasBeenSet()); // Caller hasn't called SetBookmark.
138 43007 : if (had_parser_error_) {
139 43007 : scanner_->set_parser_error();
140 : } else {
141 43007 : scanner_->reset_parser_error_flag();
142 43007 : scanner_->SeekNext(bookmark_);
143 : }
144 43007 : bookmark_ = kBookmarkWasApplied;
145 43007 : }
146 :
147 0 : bool Scanner::BookmarkScope::HasBeenSet() const {
148 0 : return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied;
149 : }
150 :
151 0 : bool Scanner::BookmarkScope::HasBeenApplied() const {
152 0 : return bookmark_ == kBookmarkWasApplied;
153 : }
154 :
155 : // ----------------------------------------------------------------------------
156 : // Scanner
157 :
// Constructs a scanner reading from |source| (must be non-null, checked in
// debug builds). |is_module| is stored in is_module_, which
// SkipSingleHTMLComment() consults to reject HTML-style comments. The octal
// error slots start out empty and the numeric-separator flag starts disabled.
158 2952079 : Scanner::Scanner(Utf16CharacterStream* source, bool is_module)
159 : : source_(source),
160 : found_html_comment_(false),
161 : allow_harmony_numeric_separator_(false),
162 : is_module_(is_module),
163 : octal_pos_(Location::invalid()),
164 14760377 : octal_message_(MessageTemplate::kNone) {
165 : DCHECK_NOT_NULL(source);
166 2952079 : }
167 :
168 5904176 : void Scanner::Initialize() {
169 : // Need to capture identifiers in order to recognize "get" and "set"
170 : // in object literals.
171 2952048 : Init();
172 2952128 : next().after_line_terminator = true;
173 : Scan();
174 2952113 : }
175 :
176 : template <bool capture_raw, bool unicode>
177 85770439 : uc32 Scanner::ScanHexNumber(int expected_length) {
178 : DCHECK_LE(expected_length, 4); // prevent overflow
179 :
180 28548069 : int begin = source_pos() - 2;
181 : uc32 x = 0;
182 85759900 : for (int i = 0; i < expected_length; i++) {
183 57229998 : int d = HexValue(c0_);
184 57229998 : if (d < 0) {
185 : ReportScannerError(Location(begin, begin + expected_length + 2),
186 : unicode
187 : ? MessageTemplate::kInvalidUnicodeEscapeSequence
188 1148 : : MessageTemplate::kInvalidHexEscapeSequence);
189 : return -1;
190 : }
191 57228850 : x = x * 16 + d;
192 6480 : Advance<capture_raw>();
193 : }
194 :
195 : return x;
196 : }
197 :
198 : template <bool capture_raw>
199 214168 : uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
200 : uc32 x = 0;
201 42626 : int d = HexValue(c0_);
202 42626 : if (d < 0) return -1;
203 :
204 216289 : while (d >= 0) {
205 178448 : x = x * 16 + d;
206 178448 : if (x > max_value) {
207 : ReportScannerError(Location(beg_pos, source_pos() + 1),
208 : MessageTemplate::kUndefinedUnicodeCodePoint);
209 : return -1;
210 : }
211 6906 : Advance<capture_raw>();
212 6906 : d = HexValue(c0_);
213 : }
214 :
215 : return x;
216 : }
217 :
// Advances the token window by one and returns the token that is now
// current. The three TokenDesc slots (current_, next_, next_next_) are
// rotated rather than copied; a fresh token is scanned only when no
// look-ahead token (next_next_) is already available.
218 1153703464 : Token::Value Scanner::Next() {
219 : // Rotate through tokens.
220 384571695 : TokenDesc* previous = current_;
221 384571695 : current_ = next_;
222 : // Either we already have the next token lined up, in which case next_next_
223 : // simply becomes next_. In that case we use current_ as new next_next_ and
224 : // clear its token to indicate that it wasn't scanned yet. Otherwise we use
225 : // current_ as next_ and scan into it, leaving next_next_ uninitialized.
226 384571695 : if (V8_LIKELY(next_next().token == Token::UNINITIALIZED)) {
227 381998267 : next_ = previous;
228 : // Use 'previous' instead of 'next_' because for some reason the compiler
229 : // thinks 'next_' could be modified before the entry into Scan.
230 381998267 : previous->after_line_terminator = false;
231 : Scan(previous);
232 : } else {
233 2573428 : next_ = next_next_;
234 2573428 : next_next_ = previous;
235 2573428 : previous->token = Token::UNINITIALIZED;
236 : DCHECK_NE(Token::UNINITIALIZED, current().token);
237 : }
238 384560074 : return current().token;
239 : }
240 :
241 2084943 : Token::Value Scanner::PeekAhead() {
242 : DCHECK(next().token != Token::DIV);
243 : DCHECK(next().token != Token::ASSIGN_DIV);
244 :
245 2084943 : if (next_next().token != Token::UNINITIALIZED) {
246 : return next_next().token;
247 : }
248 1917456 : TokenDesc* temp = next_;
249 1917456 : next_ = next_next_;
250 1917456 : next().after_line_terminator = false;
251 : Scan();
252 1917895 : next_next_ = next_;
253 1917895 : next_ = temp;
254 1917895 : return next_next().token;
255 : }
256 :
257 198 : Token::Value Scanner::SkipSingleHTMLComment() {
258 193 : if (is_module_) {
259 : ReportScannerError(source_pos(), MessageTemplate::kHtmlCommentInModule);
260 : return Token::ILLEGAL;
261 : }
262 188 : return SkipSingleLineComment();
263 : }
264 :
265 7027004 : Token::Value Scanner::SkipSingleLineComment() {
266 : // The line terminator at the end of the line is not considered
267 : // to be part of the single-line comment; it is recognized
268 : // separately by the lexical grammar and becomes part of the
269 : // stream of input elements for the syntactic grammar (see
270 : // ECMA-262, section 7.4).
271 710148546 : AdvanceUntil([](uc32 c0_) { return unibrow::IsLineTerminator(c0_); });
272 :
273 7034045 : return Token::WHITESPACE;
274 : }
275 :
276 4686 : Token::Value Scanner::SkipSourceURLComment() {
277 4074 : TryToParseSourceURLComment();
278 9692 : while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
279 : Advance();
280 : }
281 :
282 4074 : return Token::WHITESPACE;
283 : }
284 :
// Attempts to parse the remainder of a "//#" or "//@" comment as a magic
// comment and, on success, leaves the value in source_url_ or
// source_mapping_url_. Only the names "sourceURL" and "sourceMappingURL" are
// recognized. The function returns silently on anything it cannot parse; the
// caller is responsible for skipping the rest of the line.
285 90582 : void Scanner::TryToParseSourceURLComment() {
286 : // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
287 : // function will just return if it cannot parse a magic comment.
288 : DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput));
289 8215 : if (!IsWhiteSpace(c0_)) return;
290 : Advance();
291 : LiteralBuffer name;
292 : name.Start();
293 :
// Collect the name: everything up to whitespace, a line terminator, '=' or
// end of input.
294 124390 : while (c0_ != kEndOfInput && !IsWhiteSpaceOrLineTerminator(c0_) &&
295 41450 : c0_ != '=') {
296 37405 : name.AddChar(c0_);
297 : Advance();
298 : }
299 4065 : if (!name.is_one_byte()) return;
300 : Vector<const uint8_t> name_literal = name.one_byte_literal();
301 : LiteralBuffer* value;
302 4065 : if (name_literal == StaticCharVector("sourceURL")) {
303 3907 : value = &source_url_;
304 158 : } else if (name_literal == StaticCharVector("sourceMappingURL")) {
305 130 : value = &source_mapping_url_;
306 : } else {
307 : return;
308 : }
// The '=' must follow the name directly; no whitespace is skipped here.
309 4037 : if (c0_ != '=')
310 : return;
// NOTE(review): Start() presumably discards any value stored by an earlier
// magic comment of the same kind -- confirm against LiteralBuffer::Start().
311 : value->Start();
312 : Advance();
313 8094 : while (IsWhiteSpace(c0_)) {
314 : Advance();
315 : }
316 86219 : while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
317 : // Disallowed characters.
318 41001 : if (c0_ == '"' || c0_ == '\'') {
319 : value->Start();
320 : return;
321 : }
322 40981 : if (IsWhiteSpace(c0_)) {
323 : break;
324 : }
325 40941 : value->AddChar(c0_);
326 : Advance();
327 : }
328 : // Allow whitespace at the end.
// Anything other than whitespace after the value restarts the value buffer.
329 4387 : while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
330 70 : if (!IsWhiteSpace(c0_)) {
331 : value->Start();
332 : break;
333 : }
334 : Advance();
335 : }
336 : }
337 :
338 6823945 : Token::Value Scanner::SkipMultiLineComment() {
339 : DCHECK_EQ(c0_, '*');
340 : Advance();
341 :
342 6580815 : while (c0_ != kEndOfInput) {
343 : DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
344 6648451 : if (!HasLineTerminatorBeforeNext() && unibrow::IsLineTerminator(c0_)) {
345 : // Following ECMA-262, section 7.4, a comment containing
346 : // a newline will make the comment count as a line-terminator.
347 464 : next().after_line_terminator = true;
348 : }
349 :
350 6711344 : while (V8_UNLIKELY(c0_ == '*')) {
351 : Advance();
352 186746 : if (c0_ == '/') {
353 : Advance();
354 56306 : return Token::WHITESPACE;
355 : }
356 : }
357 : Advance();
358 : }
359 :
360 : // Unterminated multi-line comment.
361 : return Token::ILLEGAL;
362 : }
363 :
364 14296 : void Scanner::SkipHashBang() {
365 14296 : if (c0_ == '#' && Peek() == '!' && source_pos() == 0) {
366 160 : SkipSingleLineComment();
367 : Scan();
368 : }
369 10936 : }
370 :
371 202 : Token::Value Scanner::ScanHtmlComment() {
372 : // Check for <!-- comments.
373 : DCHECK_EQ(c0_, '!');
374 : Advance();
375 129 : if (c0_ != '-' || Peek() != '-') {
376 : PushBack('!'); // undo Advance()
377 34 : return Token::LT;
378 : }
379 : Advance();
380 :
381 39 : found_html_comment_ = true;
382 39 : return SkipSingleHTMLComment();
383 : }
384 :
#ifdef DEBUG
// Debug-only consistency check on a token descriptor.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  // Only TEMPLATE_* tokens can have an invalid_template_escape_message.
  // ILLEGAL and UNINITIALIZED can have garbage for the field (their
  // literal_chars and other members might be garbage as well; that's ok).
  const Token::Value kind = token.token;
  const bool field_is_unconstrained =
      kind == Token::UNINITIALIZED || kind == Token::ILLEGAL ||
      kind == Token::TEMPLATE_SPAN || kind == Token::TEMPLATE_TAIL;
  if (field_is_unconstrained) return;

  DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
}
#endif  // DEBUG
403 :
404 249463 : void Scanner::SeekForward(int pos) {
405 : // After this call, we will have the token at the given position as
406 : // the "next" token. The "current" token will be invalid.
407 124744 : if (pos == next().location.beg_pos) return;
408 : int current_pos = source_pos();
409 : DCHECK_EQ(next().location.end_pos, current_pos);
410 : // Positions inside the lookahead token aren't supported.
411 : DCHECK(pos >= current_pos);
412 62367 : if (pos != current_pos) {
413 62362 : source_->Seek(pos);
414 : Advance();
415 : // This function is only called to seek to the location
416 : // of the end of a function (at the "}" token). It doesn't matter
417 : // whether there was a line terminator in the part we skip.
418 62362 : next().after_line_terminator = false;
419 : }
420 : Scan();
421 : }
422 :
// Scans one escape sequence; the backslash has already been consumed and c0_
// is the character after it. The decoded character is appended to the
// literal buffer. Returns false when a \u or \x escape is malformed (the
// helper returned a negative value), true otherwise. When capture_raw is
// false, an escaped line terminator adds nothing to the literal.
423 : template <bool capture_raw>
424 58329233 : bool Scanner::ScanEscape() {
425 29172181 : uc32 c = c0_;
426 15147 : Advance<capture_raw>();
427 :
428 : // Skip escaped newlines.
429 : DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
430 58323814 : if (!capture_raw && unibrow::IsLineTerminator(c)) {
431 : // Allow escaped CR+LF newlines in multiline string literals.
432 11071 : if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
433 : return true;
434 : }
435 :
// Translate the character after the backslash. Characters without a case
// below keep their own value.
436 29165983 : switch (c) {
437 : case '\'': // fall through
438 : case '"' : // fall through
439 : case '\\': break;
440 60 : case 'b' : c = '\b'; break;
441 108 : case 'f' : c = '\f'; break;
442 466528 : case 'n' : c = '\n'; break;
443 755 : case 'r' : c = '\r'; break;
444 411 : case 't' : c = '\t'; break;
445 : case 'u' : {
446 105170 : c = ScanUnicodeEscape<capture_raw>();
447 105167 : if (c < 0) return false;
448 : break;
449 : }
450 : case 'v':
451 : c = '\v';
452 48 : break;
453 : case 'x': {
454 28463199 : c = ScanHexNumber<capture_raw>(2);
455 28453604 : if (c < 0) return false;
456 : break;
457 : }
458 : case '0': // Fall through.
459 : case '1': // fall through
460 : case '2': // fall through
461 : case '3': // fall through
462 : case '4': // fall through
463 : case '5': // fall through
464 : case '6': // fall through
465 : case '7':
466 4265 : c = ScanOctalEscape<capture_raw>(c, 2);
467 4265 : break;
468 : }
469 :
470 : // Other escaped characters are interpreted as their non-escaped version.
471 : AddLiteralChar(c);
472 : return true;
473 : }
474 :
475 : template <bool capture_raw>
476 9682 : uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
477 4265 : uc32 x = c - '0';
478 : int i = 0;
479 6740 : for (; i < length; i++) {
480 6147 : int d = c0_ - '0';
481 6147 : if (d < 0 || d > 7) break;
482 2493 : int nx = x * 8 + d;
483 2493 : if (nx >= 256) break;
484 : x = nx;
485 885 : Advance<capture_raw>();
486 : }
487 : // Anything except '\0' is an octal escape sequence, illegal in strict mode.
488 : // Remember the position of octal escape sequences so that an error
489 : // can be reported later (in strict mode).
490 : // We don't report the error immediately, because the octal escape can
491 : // occur before the "use strict" directive.
492 5299 : if (c != '0' || i > 0 || IsNonOctalDecimalDigit(c0_)) {
493 3827 : octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
494 3827 : octal_message_ = capture_raw ? MessageTemplate::kTemplateOctalLiteral
495 : : MessageTemplate::kStrictOctalEscape;
496 : }
497 4265 : return x;
498 : }
499 :
// Scans a string literal; on entry c0_ is the opening quote character.
// Returns Token::STRING once the matching closing quote has been consumed.
// Returns Token::ILLEGAL on end of input, on a failed escape sequence, or on
// a string-literal line terminator inside the literal.
500 78524676 : Token::Value Scanner::ScanString() {
501 10078990 : uc32 quote = c0_;
502 : Advance(); // consume quote
503 :
504 : next().literal_chars.Start();
505 : while (true) {
506 39664651 : if (V8_UNLIKELY(c0_ == kEndOfInput)) return Token::ILLEGAL;
// Fast path: c0_ cannot end the string, so add it and bulk-consume the
// characters that follow until one needs a closer look.
507 79343082 : if ((V8_UNLIKELY(static_cast<uint32_t>(c0_) >= kMaxAscii) &&
508 79313725 : !unibrow::IsStringLiteralLineTerminator(c0_)) ||
509 39649269 : !MayTerminateString(character_scan_flags[c0_])) {
510 : AddLiteralChar(c0_);
511 144612598 : AdvanceUntil([this](uc32 c0) {
512 144612598 : if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
513 961945 : if (V8_UNLIKELY(unibrow::IsStringLiteralLineTerminator(c0))) {
514 : return true;
515 : }
516 961683 : AddLiteralChar(c0);
517 : return false;
518 : }
519 143650653 : uint8_t char_flags = character_scan_flags[c0];
520 143650653 : if (MayTerminateString(char_flags)) return true;
521 134604493 : AddLiteralChar(c0);
522 : return false;
523 : });
524 : }
// Slow path: decide what the character that stopped the fast path means.
525 39665146 : if (c0_ == quote) {
526 : Advance();
527 10078059 : return Token::STRING;
528 : }
529 29587043 : if (c0_ == '\\') {
530 : Advance();
531 : // TODO(verwaest): Check whether we can remove the additional check.
532 29155504 : if (V8_UNLIKELY(c0_ == kEndOfInput || !ScanEscape<false>())) {
533 : return Token::ILLEGAL;
534 : }
535 : continue;
536 : }
537 854699 : if (V8_UNLIKELY(c0_ == kEndOfInput ||
538 : unibrow::IsStringLiteralLineTerminator(c0_))) {
539 : return Token::ILLEGAL;
540 : }
// What remains is the other kind of quote, which is ordinary string content.
541 : DCHECK_NE(quote, c0_);
542 : DCHECK((c0_ == '\'' || c0_ == '"'));
543 : AddLiteralCharAdvance();
544 : }
545 : }
546 :
547 620842 : Token::Value Scanner::ScanPrivateName() {
548 260764 : if (!allow_harmony_private_fields()) {
549 : ReportScannerError(source_pos(),
550 : MessageTemplate::kInvalidOrUnexpectedToken);
551 : return Token::ILLEGAL;
552 : }
553 :
554 : next().literal_chars.Start();
555 : DCHECK_EQ(c0_, '#');
556 : DCHECK(!IsIdentifierStart(kEndOfInput));
557 78835 : if (!IsIdentifierStart(Peek())) {
558 : ReportScannerError(source_pos(),
559 : MessageTemplate::kInvalidOrUnexpectedToken);
560 : return Token::ILLEGAL;
561 : }
562 :
563 : AddLiteralCharAdvance();
564 : Token::Value token = ScanIdentifierOrKeywordInner();
565 58357 : return token == Token::ILLEGAL ? Token::ILLEGAL : Token::PRIVATE_NAME;
566 : }
567 :
// Scans template literal characters up to and including the next '`' or
// "${", filling both the cooked (literal_chars) and raw (raw_literal_chars)
// buffers of the next token. Returns Token::TEMPLATE_TAIL when a '`' ended
// the span and Token::TEMPLATE_SPAN otherwise, which includes the case where
// the input ended first. Errors from invalid escapes are not reported here
// but moved onto the token for the parser to handle.
568 3055145 : Token::Value Scanner::ScanTemplateSpan() {
569 : // When scanning a TemplateSpan, we are looking for the following construct:
570 : // TEMPLATE_SPAN ::
571 : // ` LiteralChars* ${
572 : // | } LiteralChars* ${
573 : //
574 : // TEMPLATE_TAIL ::
575 : // ` LiteralChars* `
576 : // | } LiteralChar* `
577 : //
578 : // A TEMPLATE_SPAN should always be followed by an Expression, while a
579 : // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
580 : // followed by an Expression.
581 :
582 : // These scoped helpers save and restore the original error state, so that we
583 : // can specially treat invalid escape sequences in templates (which are
584 : // handled by the parser).
585 157107 : ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_);
586 157107 : ErrorState octal_error_state(&octal_message_, &octal_pos_);
587 :
588 : Token::Value result = Token::TEMPLATE_SPAN;
589 : next().literal_chars.Start();
590 : next().raw_literal_chars.Start();
591 : const bool capture_raw = true;
592 : while (true) {
593 2095926 : uc32 c = c0_;
594 2095926 : if (c == '`') {
595 : Advance(); // Consume '`'
596 : result = Token::TEMPLATE_TAIL;
597 68103 : break;
598 2115139 : } else if (c == '$' && Peek() == '{') {
599 : Advance(); // Consume '$'
600 : Advance(); // Consume '{'
601 : break;
602 1940539 : } else if (c == '\\') {
603 : Advance(); // Consume '\\'
604 : DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
605 : if (capture_raw) AddRawLiteralChar('\\');
606 30648 : if (unibrow::IsLineTerminator(c0_)) {
607 : // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
608 : // code unit sequence.
609 : uc32 lastChar = c0_;
610 : Advance();
611 177 : if (lastChar == '\r') {
612 : // Also skip \n.
613 102 : if (c0_ == '\n') Advance();
614 : lastChar = '\n';
615 : }
616 : if (capture_raw) AddRawLiteralChar(lastChar);
617 : } else {
618 15147 : bool success = ScanEscape<capture_raw>();
619 : USE(success);
620 : DCHECK_EQ(!success, has_error());
621 : // For templates, invalid escape sequence checking is handled in the
622 : // parser.
623 15146 : scanner_error_state.MoveErrorTo(next_);
624 15146 : octal_error_state.MoveErrorTo(next_);
625 : }
626 1925216 : } else if (c < 0) {
627 : // Unterminated template literal
628 : break;
629 : } else {
630 : Advance(); // Consume c.
631 : // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
632 : // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
633 : // consisting of the CV 0x000A.
634 1923518 : if (c == '\r') {
635 629 : if (c0_ == '\n') Advance(); // Consume '\n'
636 : c = '\n';
637 : }
638 : if (capture_raw) AddRawLiteralChar(c);
639 : AddLiteralChar(c);
640 : }
641 : }
// The token is finalized here rather than by the caller.
642 157113 : next().location.end_pos = source_pos();
643 157113 : next().token = result;
644 :
645 157113 : return result;
646 : }
647 :
648 1710701 : Handle<String> Scanner::SourceUrl(Isolate* isolate) const {
649 : Handle<String> tmp;
650 1710701 : if (source_url_.length() > 0) {
651 3872 : tmp = source_url_.Internalize(isolate);
652 : }
653 1710701 : return tmp;
654 : }
655 :
656 1710702 : Handle<String> Scanner::SourceMappingUrl(Isolate* isolate) const {
657 : Handle<String> tmp;
658 1710702 : if (source_mapping_url_.length() > 0) {
659 100 : tmp = source_mapping_url_.Internalize(isolate);
660 : }
661 1710702 : return tmp;
662 : }
663 :
664 2696 : bool Scanner::ScanDigitsWithNumericSeparators(bool (*predicate)(uc32 ch),
665 2894 : bool is_check_first_digit) {
666 : // we must have at least one digit after 'x'/'b'/'o'
667 2696 : if (is_check_first_digit && !predicate(c0_)) return false;
668 :
669 : bool separator_seen = false;
670 7861 : while (predicate(c0_) || c0_ == '_') {
671 5777 : if (c0_ == '_') {
672 : Advance();
673 2200 : if (c0_ == '_') {
674 : ReportScannerError(Location(source_pos(), source_pos() + 1),
675 : MessageTemplate::kContinuousNumericSeparator);
676 : return false;
677 : }
678 : separator_seen = true;
679 : continue;
680 : }
681 : separator_seen = false;
682 : AddLiteralCharAdvance();
683 : }
684 :
685 2082 : if (separator_seen) {
686 : ReportScannerError(Location(source_pos(), source_pos() + 1),
687 : MessageTemplate::kTrailingNumericSeparator);
688 : return false;
689 : }
690 :
691 : return true;
692 : }
693 :
694 2223189 : bool Scanner::ScanDecimalDigits() {
695 2223189 : if (allow_harmony_numeric_separator()) {
696 1619 : return ScanDigitsWithNumericSeparators(&IsDecimalDigit, false);
697 : }
698 7613164 : while (IsDecimalDigit(c0_)) {
699 : AddLiteralCharAdvance();
700 : }
701 : return true;
702 : }
703 :
704 59687 : bool Scanner::ScanDecimalAsSmiWithNumericSeparators(uint64_t* value) {
705 : bool separator_seen = false;
706 153473 : while (IsDecimalDigit(c0_) || c0_ == '_') {
707 24615 : if (c0_ == '_') {
708 : Advance();
709 1067 : if (c0_ == '_') {
710 : ReportScannerError(Location(source_pos(), source_pos() + 1),
711 : MessageTemplate::kContinuousNumericSeparator);
712 : return false;
713 : }
714 : separator_seen = true;
715 : continue;
716 : }
717 : separator_seen = false;
718 23549 : *value = 10 * *value + (c0_ - '0');
719 23549 : uc32 first_char = c0_;
720 : Advance();
721 : AddLiteralChar(first_char);
722 : }
723 :
724 34719 : if (separator_seen) {
725 : ReportScannerError(Location(source_pos(), source_pos() + 1),
726 : MessageTemplate::kTrailingNumericSeparator);
727 : return false;
728 : }
729 :
730 : return true;
731 : }
732 :
733 86307153 : bool Scanner::ScanDecimalAsSmi(uint64_t* value) {
734 32820239 : if (allow_harmony_numeric_separator()) {
735 34807 : return ScanDecimalAsSmiWithNumericSeparators(value);
736 : }
737 :
738 172544688 : while (IsDecimalDigit(c0_)) {
739 53486914 : *value = 10 * *value + (c0_ - '0');
740 53486914 : uc32 first_char = c0_;
741 : Advance();
742 : AddLiteralChar(first_char);
743 : }
744 : return true;
745 : }
746 :
747 756 : bool Scanner::ScanBinaryDigits() {
748 756 : if (allow_harmony_numeric_separator()) {
749 359 : return ScanDigitsWithNumericSeparators(&IsBinaryDigit, true);
750 : }
751 :
752 : // we must have at least one binary digit after 'b'/'B'
753 794 : if (!IsBinaryDigit(c0_)) {
754 : return false;
755 : }
756 :
757 1592 : while (IsBinaryDigit(c0_)) {
758 : AddLiteralCharAdvance();
759 : }
760 : return true;
761 : }
762 :
763 767 : bool Scanner::ScanOctalDigits() {
764 767 : if (allow_harmony_numeric_separator()) {
765 359 : return ScanDigitsWithNumericSeparators(&IsOctalDigit, true);
766 : }
767 :
768 : // we must have at least one octal digit after 'o'/'O'
769 816 : if (!IsOctalDigit(c0_)) {
770 : return false;
771 : }
772 :
773 1748 : while (IsOctalDigit(c0_)) {
774 : AddLiteralCharAdvance();
775 : }
776 : return true;
777 : }
778 :
779 196442 : bool Scanner::ScanImplicitOctalDigits(int start_pos,
780 196436 : Scanner::NumberKind* kind) {
781 196442 : *kind = IMPLICIT_OCTAL;
782 :
783 : while (true) {
784 : // (possible) octal number
785 786926 : if (IsNonOctalDecimalDigit(c0_)) {
786 6 : *kind = DECIMAL_WITH_LEADING_ZERO;
787 6 : return true;
788 : }
789 393457 : if (!IsOctalDigit(c0_)) {
790 : // Octal literal finished.
791 196436 : octal_pos_ = Location(start_pos, source_pos());
792 196436 : octal_message_ = MessageTemplate::kStrictOctalLiteral;
793 196436 : return true;
794 : }
795 : AddLiteralCharAdvance();
796 : }
797 : }
798 :
799 533960 : bool Scanner::ScanHexDigits() {
800 533960 : if (allow_harmony_numeric_separator()) {
801 359 : return ScanDigitsWithNumericSeparators(&IsHexDigit, true);
802 : }
803 :
804 : // we must have at least one hex digit after 'x'/'X'
805 1067202 : if (!IsHexDigit(c0_)) {
806 : return false;
807 : }
808 :
809 3714904 : while (IsHexDigit(c0_)) {
810 : AddLiteralCharAdvance();
811 : }
812 : return true;
813 : }
814 :
815 16318 : bool Scanner::ScanSignedInteger() {
816 16318 : if (c0_ == '+' || c0_ == '-') AddLiteralCharAdvance();
817 : // we must have at least one decimal digit after 'e'/'E'
818 32636 : if (!IsDecimalDigit(c0_)) return false;
819 15944 : return ScanDecimalDigits();
820 : }
821 :
// Scans a numeric literal. |seen_period| is true when the caller has already
// consumed a leading '.', in which case c0_ is the first fraction digit.
// Returns Token::SMI for short decimal integers that fit in a Smi,
// Token::BIGINT for literals with an 'n' suffix, Token::NUMBER for all other
// well-formed literals, and Token::ILLEGAL for malformed ones. Literals with
// a leading zero additionally record their position in octal_pos_ /
// octal_message_.
822 177355827 : Token::Value Scanner::ScanNumber(bool seen_period) {
823 : DCHECK(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
824 :
825 33555673 : NumberKind kind = DECIMAL;
826 :
827 : next().literal_chars.Start();
// at_start stays true only while the Smi fast path further down may run.
828 33555673 : bool at_start = !seen_period;
829 : int start_pos = source_pos(); // For reporting octal positions.
830 33555673 : if (seen_period) {
831 : // we have already seen a decimal point of the float
832 : AddLiteralChar('.');
833 3258 : if (allow_harmony_numeric_separator() && c0_ == '_') {
834 : return Token::ILLEGAL;
835 : }
836 : // we know we have at least one digit
837 3258 : if (!ScanDecimalDigits()) return Token::ILLEGAL;
838 : } else {
839 : // if the first character is '0' we must check for octals and hex
840 33552415 : if (c0_ == '0') {
841 : AddLiteralCharAdvance();
842 :
843 : // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
844 : // an octal number.
845 11891282 : if (c0_ == 'x' || c0_ == 'X') {
846 : AddLiteralCharAdvance();
847 533941 : kind = HEX;
848 533941 : if (!ScanHexDigits()) return Token::ILLEGAL;
849 11357331 : } else if (c0_ == 'o' || c0_ == 'O') {
850 : AddLiteralCharAdvance();
851 767 : kind = OCTAL;
852 767 : if (!ScanOctalDigits()) return Token::ILLEGAL;
853 11356563 : } else if (c0_ == 'b' || c0_ == 'B') {
854 : AddLiteralCharAdvance();
855 756 : kind = BINARY;
856 756 : if (!ScanBinaryDigits()) return Token::ILLEGAL;
857 11355807 : } else if (IsOctalDigit(c0_)) {
858 196442 : kind = IMPLICIT_OCTAL;
859 196442 : if (!ScanImplicitOctalDigits(start_pos, &kind)) {
860 : return Token::ILLEGAL;
861 : }
862 196442 : if (kind == DECIMAL_WITH_LEADING_ZERO) {
863 : at_start = false;
864 : }
865 11159365 : } else if (IsNonOctalDecimalDigit(c0_)) {
866 48818 : kind = DECIMAL_WITH_LEADING_ZERO;
867 11110547 : } else if (allow_harmony_numeric_separator() && c0_ == '_') {
868 : ReportScannerError(Location(source_pos(), source_pos() + 1),
869 : MessageTemplate::kZeroDigitNumericSeparator);
870 : return Token::ILLEGAL;
871 : }
872 : }
873 :
874 : // Parse decimal digits and allow trailing fractional part.
875 33550776 : if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) {
876 : // This is an optimization for parsing Decimal numbers as Smi's.
877 32819948 : if (at_start) {
878 32820005 : uint64_t value = 0;
879 : // scan subsequent decimal digits
880 32820005 : if (!ScanDecimalAsSmi(&value)) {
881 31654295 : return Token::ILLEGAL;
882 : }
883 :
// The literal-length bound is checked before |value| is compared.
884 65610368 : if (next().literal_chars.one_byte_literal().length() <= 10 &&
885 97187368 : value <= Smi::kMaxValue && c0_ != '.' && !IsIdentifierStart(c0_)) {
886 31654001 : next().smi_value_ = static_cast<uint32_t>(value);
887 :
888 31654001 : if (kind == DECIMAL_WITH_LEADING_ZERO) {
889 48818 : octal_pos_ = Location(start_pos, source_pos());
890 48818 : octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
891 : }
892 : return Token::SMI;
893 : }
894 : }
895 :
896 1166009 : if (!ScanDecimalDigits()) return Token::ILLEGAL;
897 1166060 : if (c0_ == '.') {
898 : seen_period = true;
899 : AddLiteralCharAdvance();
900 1038051 : if (allow_harmony_numeric_separator() && c0_ == '_') {
901 : return Token::ILLEGAL;
902 : }
903 1037967 : if (!ScanDecimalDigits()) return Token::ILLEGAL;
904 : }
905 : }
906 : }
907 :
// An 'n' suffix makes a BigInt, but only for integer literals without a
// period and never for the leading-zero kinds.
908 : bool is_bigint = false;
909 3813491 : if (c0_ == 'n' && !seen_period &&
910 13735 : (kind == DECIMAL || kind == HEX || kind == OCTAL || kind == BINARY)) {
911 : // Check that the literal is within our limits for BigInt length.
912 : // For simplicity, use 4 bits per character to calculate the maximum
913 : // allowed literal length.
914 : static const int kMaxBigIntCharacters = BigInt::kMaxLengthBits / 4;
// Non-decimal kinds have a two-character prefix that is not counted.
915 13693 : int length = source_pos() - start_pos - (kind != DECIMAL ? 2 : 0);
916 13693 : if (length > kMaxBigIntCharacters) {
917 : ReportScannerError(Location(start_pos, source_pos()),
918 : MessageTemplate::kBigIntTooBig);
919 : return Token::ILLEGAL;
920 : }
921 :
922 : is_bigint = true;
923 : Advance();
924 1886197 : } else if (c0_ == 'e' || c0_ == 'E') {
925 : // scan exponent, if any
926 : DCHECK(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
927 :
928 16318 : if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO))
929 : return Token::ILLEGAL;
930 :
931 : // scan exponent
932 : AddLiteralCharAdvance();
933 :
934 16318 : if (!ScanSignedInteger()) return Token::ILLEGAL;
935 : }
936 :
937 : // The source character immediately following a numeric literal must
938 : // not be an identifier start or a decimal digit; see ECMA-262
939 : // section 7.8.3, page 17 (note that we read only one decimal digit
940 : // if the value is 0).
941 5698589 : if (IsDecimalDigit(c0_) || IsIdentifierStart(c0_)) {
942 : return Token::ILLEGAL;
943 : }
944 :
945 1895875 : if (kind == DECIMAL_WITH_LEADING_ZERO) {
946 6 : octal_pos_ = Location(start_pos, source_pos());
947 6 : octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
948 : }
949 :
950 1895875 : return is_bigint ? Token::BIGINT : Token::NUMBER;
951 : }
952 :
953 46151 : uc32 Scanner::ScanIdentifierUnicodeEscape() {
954 : Advance();
955 23240 : if (c0_ != 'u') return -1;
956 : Advance();
957 22911 : return ScanUnicodeEscape<false>();
958 : }
959 :
960 : template <bool capture_raw>
961 138588 : uc32 Scanner::ScanUnicodeEscape() {
962 : // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
963 : // hex digits between { } is arbitrary. \ and u have already been read.
964 128083 : if (c0_ == '{') {
965 42628 : int begin = source_pos() - 2;
966 5634 : Advance<capture_raw>();
967 42627 : uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10FFFF, begin);
968 42619 : if (cp < 0 || c0_ != '}') {
969 : ReportScannerError(source_pos(),
970 : MessageTemplate::kInvalidUnicodeEscapeSequence);
971 : return -1;
972 : }
973 762 : Advance<capture_raw>();
974 36230 : return cp;
975 : }
976 : const bool unicode = true;
977 85455 : return ScanHexNumber<capture_raw, unicode>(4);
978 : }
979 :
980 16014 : Token::Value Scanner::ScanIdentifierOrKeywordInnerSlow(bool escaped,
981 28566 : bool can_be_keyword) {
982 : while (true) {
983 69782 : if (c0_ == '\\') {
984 : escaped = true;
985 13309 : uc32 c = ScanIdentifierUnicodeEscape();
986 : // Only allow legal identifier part characters.
987 : // TODO(verwaest): Make this true.
988 : // DCHECK(!IsIdentifierPart('\'));
989 : DCHECK(!IsIdentifierPart(-1));
990 26618 : if (c == '\\' || !IsIdentifierPart(c)) {
991 : return Token::ILLEGAL;
992 : }
993 25380 : can_be_keyword = can_be_keyword && CharCanBeKeyword(c);
994 : AddLiteralChar(c);
995 128436 : } else if (IsIdentifierPart(c0_) ||
996 15634 : (CombineSurrogatePair() && IsIdentifierPart(c0_))) {
997 80872 : can_be_keyword = can_be_keyword && CharCanBeKeyword(c0_);
998 : AddLiteralCharAdvance();
999 : } else {
1000 : break;
1001 : }
1002 : }
1003 :
1004 29719 : if (can_be_keyword && next().literal_chars.is_one_byte()) {
1005 : Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
1006 : Token::Value token =
1007 : KeywordOrIdentifierToken(chars.start(), chars.length());
1008 : /* TODO(adamk): YIELD should be handled specially. */
1009 14283 : if (token == Token::FUTURE_STRICT_RESERVED_WORD) {
1010 2280 : if (escaped) return Token::ESCAPED_STRICT_RESERVED_WORD;
1011 0 : return token;
1012 : }
1013 12003 : if (token == Token::IDENTIFIER) return token;
1014 :
1015 11232 : if (!escaped) return token;
1016 :
1017 11232 : if (token == Token::LET || token == Token::STATIC) {
1018 : return Token::ESCAPED_STRICT_RESERVED_WORD;
1019 : }
1020 7980 : return Token::ESCAPED_KEYWORD;
1021 : }
1022 :
1023 : return Token::IDENTIFIER;
1024 : }
1025 :
1026 321779 : bool Scanner::ScanRegExpPattern() {
1027 : DCHECK_EQ(Token::UNINITIALIZED, next_next().token);
1028 : DCHECK(next().token == Token::DIV || next().token == Token::ASSIGN_DIV);
1029 :
1030 : // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1031 : bool in_character_class = false;
1032 :
1033 : // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1034 : // the scanner should pass uninterpreted bodies to the RegExp
1035 : // constructor.
1036 : next().literal_chars.Start();
1037 80558 : if (next().token == Token::ASSIGN_DIV) {
1038 : AddLiteralChar('=');
1039 : }
1040 :
1041 1019477 : while (c0_ != '/' || in_character_class) {
1042 1877965 : if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
1043 : return false;
1044 : }
1045 940027 : if (c0_ == '\\') { // Escape sequence.
1046 : AddLiteralCharAdvance();
1047 183910 : if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
1048 : return false;
1049 : }
1050 : AddLiteralCharAdvance();
1051 : // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1052 : // only "safe" characters are allowed (letters, digits, underscore),
1053 : // otherwise the escape isn't valid and the invalid character has
1054 : // its normal meaning. I.e., we can just continue scanning without
1055 : // worrying whether the following characters are part of the escape
1056 : // or not, since any '/', '\\' or '[' is guaranteed to not be part
1057 : // of the escape sequence.
1058 :
1059 : // TODO(896): At some point, parse RegExps more thoroughly to capture
1060 : // octal escapes in strict mode.
1061 : } else { // Unescaped character.
1062 847993 : if (c0_ == '[') in_character_class = true;
1063 847993 : if (c0_ == ']') in_character_class = false;
1064 : AddLiteralCharAdvance();
1065 : }
1066 : }
1067 : Advance(); // consume '/'
1068 :
1069 80330 : next().token = Token::REGEXP_LITERAL;
1070 80330 : return true;
1071 : }
1072 :
1073 :
1074 194256 : Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
1075 : DCHECK_EQ(Token::REGEXP_LITERAL, next().token);
1076 :
1077 : // Scan regular expression flags.
1078 : int flags = 0;
1079 309521 : while (IsIdentifierPart(c0_)) {
1080 : RegExp::Flags flag = RegExp::kNone;
1081 35051 : switch (c0_) {
1082 : case 'g':
1083 : flag = RegExp::kGlobal;
1084 : break;
1085 : case 'i':
1086 : flag = RegExp::kIgnoreCase;
1087 4346 : break;
1088 : case 'm':
1089 : flag = RegExp::kMultiline;
1090 590 : break;
1091 : case 's':
1092 : flag = RegExp::kDotAll;
1093 156 : break;
1094 : case 'u':
1095 : flag = RegExp::kUnicode;
1096 7123 : break;
1097 : case 'y':
1098 : flag = RegExp::kSticky;
1099 122 : break;
1100 : default:
1101 : return Nothing<RegExp::Flags>();
1102 : }
1103 34563 : if (flags & flag) {
1104 : return Nothing<RegExp::Flags>();
1105 : }
1106 : Advance();
1107 34440 : flags |= flag;
1108 : }
1109 :
1110 79608 : next().location.end_pos = source_pos();
1111 79608 : return Just(RegExp::Flags(flags));
1112 : }
1113 :
1114 98518009 : const AstRawString* Scanner::CurrentSymbol(
1115 : AstValueFactory* ast_value_factory) const {
1116 98518009 : if (is_literal_one_byte()) {
1117 98484566 : return ast_value_factory->GetOneByteString(literal_one_byte_string());
1118 : }
1119 75285 : return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1120 : }
1121 :
1122 442240 : const AstRawString* Scanner::NextSymbol(
1123 : AstValueFactory* ast_value_factory) const {
1124 442240 : if (is_next_literal_one_byte()) {
1125 440121 : return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1126 : }
1127 2140 : return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1128 : }
1129 :
1130 83835 : const AstRawString* Scanner::CurrentRawSymbol(
1131 : AstValueFactory* ast_value_factory) const {
1132 83835 : if (is_raw_literal_one_byte()) {
1133 83801 : return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1134 : }
1135 36 : return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1136 : }
1137 :
1138 :
1139 1314765 : double Scanner::DoubleValue() {
1140 : DCHECK(is_literal_one_byte());
1141 : return StringToDouble(
1142 : literal_one_byte_string(),
1143 1314765 : ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1144 : }
1145 :
1146 12241 : const char* Scanner::CurrentLiteralAsCString(Zone* zone) const {
1147 : DCHECK(is_literal_one_byte());
1148 : Vector<const uint8_t> vector = literal_one_byte_string();
1149 : int length = vector.length();
1150 12241 : char* buffer = zone->NewArray<char>(length + 1);
1151 : memcpy(buffer, vector.start(), length);
1152 12241 : buffer[length] = '\0';
1153 12241 : return buffer;
1154 : }
1155 :
1156 86014 : void Scanner::SeekNext(size_t position) {
1157 : // Use with care: This cleanly resets most, but not all scanner state.
1158 : // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.
1159 :
1160 : // To re-scan from a given character position, we need to:
1161 : // 1, Reset the current_, next_ and next_next_ tokens
1162 : // (next_ + next_next_ will be overwritten by Next(),
1163 : // current_ will remain unchanged, so overwrite it fully.)
1164 172028 : for (TokenDesc& token : token_storage_) {
1165 129021 : token.token = Token::UNINITIALIZED;
1166 129021 : token.invalid_template_escape_message = MessageTemplate::kNone;
1167 : }
1168 : // 2, reset the source to the desired position,
1169 43007 : source_->Seek(position);
1170 : // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
1171 86014 : c0_ = source_->Advance();
1172 43007 : next().after_line_terminator = false;
1173 : Scan();
1174 : DCHECK_EQ(next().location.beg_pos, static_cast<int>(position));
1175 43007 : }
1176 :
1177 : } // namespace internal
1178 183867 : } // namespace v8
|