Line data Source code
1 : // Copyright 2011 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : // Features shared by parsing and pre-parsing scanners.
6 :
7 : #include "src/parsing/scanner.h"
8 :
9 : #include <stdint.h>
10 :
11 : #include <cmath>
12 :
13 : #include "src/ast/ast-value-factory.h"
14 : #include "src/conversions-inl.h"
15 : #include "src/objects/bigint.h"
16 : #include "src/parsing/scanner-inl.h"
17 : #include "src/zone/zone.h"
18 :
19 : namespace v8 {
20 : namespace internal {
21 :
22 : class Scanner::ErrorState {
23 : public:
24 : ErrorState(MessageTemplate* message_stack, Scanner::Location* location_stack)
25 : : message_stack_(message_stack),
26 : old_message_(*message_stack),
27 : location_stack_(location_stack),
28 308328 : old_location_(*location_stack) {
29 308328 : *message_stack_ = MessageTemplate::kNone;
30 308328 : *location_stack_ = Location::invalid();
31 : }
32 :
33 : ~ErrorState() {
34 308406 : *message_stack_ = old_message_;
35 308406 : *location_stack_ = old_location_;
36 : }
37 :
38 : void MoveErrorTo(TokenDesc* dest) {
39 30030 : if (*message_stack_ == MessageTemplate::kNone) {
40 : return;
41 : }
42 13078 : if (dest->invalid_template_escape_message == MessageTemplate::kNone) {
43 13078 : dest->invalid_template_escape_message = *message_stack_;
44 13078 : dest->invalid_template_escape_location = *location_stack_;
45 : }
46 13078 : *message_stack_ = MessageTemplate::kNone;
47 13078 : *location_stack_ = Location::invalid();
48 : }
49 :
50 : private:
51 : MessageTemplate* const message_stack_;
52 : MessageTemplate const old_message_;
53 : Scanner::Location* const location_stack_;
54 : Scanner::Location const old_location_;
55 : };
56 :
57 : // ----------------------------------------------------------------------------
58 : // Scanner::LiteralBuffer
59 :
60 3903 : Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const {
61 3903 : if (is_one_byte()) {
62 3903 : return isolate->factory()->InternalizeOneByteString(one_byte_literal());
63 : }
64 0 : return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
65 : }
66 :
67 0 : int Scanner::LiteralBuffer::NewCapacity(int min_capacity) {
68 : return min_capacity < (kMaxGrowth / (kGrowthFactor - 1))
69 : ? min_capacity * kGrowthFactor
70 5743276 : : min_capacity + kMaxGrowth;
71 : }
72 :
73 5740957 : void Scanner::LiteralBuffer::ExpandBuffer() {
74 5886313 : int min_capacity = Max(kInitialCapacity, backing_store_.length());
75 : Vector<byte> new_store = Vector<byte>::New(NewCapacity(min_capacity));
76 5741282 : if (position_ > 0) {
77 145356 : MemCopy(new_store.start(), backing_store_.start(), position_);
78 : }
79 : backing_store_.Dispose();
80 5741285 : backing_store_ = new_store;
81 5741285 : }
82 :
83 77019 : void Scanner::LiteralBuffer::ConvertToTwoByte() {
84 : DCHECK(is_one_byte());
85 : Vector<byte> new_store;
86 77019 : int new_content_size = position_ * kUC16Size;
87 308086 : if (new_content_size >= backing_store_.length()) {
88 : // Ensure room for all currently read code units as UC16 as well
89 : // as the code unit about to be stored.
90 : new_store = Vector<byte>::New(NewCapacity(new_content_size));
91 : } else {
92 74700 : new_store = backing_store_;
93 : }
94 : uint8_t* src = backing_store_.start();
95 : uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
96 192549 : for (int i = position_ - 1; i >= 0; i--) {
97 115525 : dst[i] = src[i];
98 : }
99 77024 : if (new_store.start() != backing_store_.start()) {
100 : backing_store_.Dispose();
101 2319 : backing_store_ = new_store;
102 : }
103 77024 : position_ = new_content_size;
104 77024 : is_one_byte_ = false;
105 77024 : }
106 :
107 1307150 : void Scanner::LiteralBuffer::AddTwoByteChar(uc32 code_unit) {
108 : DCHECK(!is_one_byte());
109 3975549 : if (position_ >= backing_store_.length()) ExpandBuffer();
110 1308056 : if (code_unit <=
111 : static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
112 2562920 : *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
113 1281460 : position_ += kUC16Size;
114 : } else {
115 26596 : *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
116 53192 : unibrow::Utf16::LeadSurrogate(code_unit);
117 26596 : position_ += kUC16Size;
118 26596 : if (position_ >= backing_store_.length()) ExpandBuffer();
119 26597 : *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
120 26597 : unibrow::Utf16::TrailSurrogate(code_unit);
121 26597 : position_ += kUC16Size;
122 : }
123 1308057 : }
124 :
125 : // ----------------------------------------------------------------------------
126 : // Scanner::BookmarkScope
127 :
128 : const size_t Scanner::BookmarkScope::kNoBookmark =
129 : std::numeric_limits<size_t>::max() - 1;
130 : const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
131 : std::numeric_limits<size_t>::max();
132 :
133 2510259 : void Scanner::BookmarkScope::Set(size_t position) {
134 : DCHECK_EQ(bookmark_, kNoBookmark);
135 2510259 : bookmark_ = position;
136 2510259 : }
137 :
138 48183 : void Scanner::BookmarkScope::Apply() {
139 : DCHECK(HasBeenSet()); // Caller hasn't called SetBookmark.
140 48183 : if (had_parser_error_) {
141 48183 : scanner_->set_parser_error();
142 : } else {
143 48183 : scanner_->reset_parser_error_flag();
144 48183 : scanner_->SeekNext(bookmark_);
145 : }
146 48182 : bookmark_ = kBookmarkWasApplied;
147 48182 : }
148 :
149 0 : bool Scanner::BookmarkScope::HasBeenSet() const {
150 0 : return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied;
151 : }
152 :
153 0 : bool Scanner::BookmarkScope::HasBeenApplied() const {
154 0 : return bookmark_ == kBookmarkWasApplied;
155 : }
156 :
157 : // ----------------------------------------------------------------------------
158 : // Scanner
159 :
160 2959452 : Scanner::Scanner(Utf16CharacterStream* source, bool is_module)
161 : : source_(source),
162 : found_html_comment_(false),
163 : allow_harmony_numeric_separator_(false),
164 : is_module_(is_module),
165 : octal_pos_(Location::invalid()),
166 14797266 : octal_message_(MessageTemplate::kNone) {
167 : DCHECK_NOT_NULL(source);
168 2959452 : }
169 :
170 5918890 : void Scanner::Initialize() {
171 : // Need to capture identifiers in order to recognize "get" and "set"
172 : // in object literals.
173 2959385 : Init();
174 2959505 : next().after_line_terminator = true;
175 : Scan();
176 2959492 : }
177 :
178 : template <bool capture_raw, bool unicode>
179 76631466 : uc32 Scanner::ScanHexNumber(int expected_length) {
180 : DCHECK_LE(expected_length, 4); // prevent overflow
181 :
182 25495531 : int begin = source_pos() - 2;
183 : uc32 x = 0;
184 76633690 : for (int i = 0; i < expected_length; i++) {
185 51149175 : int d = HexValue(c0_);
186 51149175 : if (d < 0) {
187 : ReportScannerError(Location(begin, begin + expected_length + 2),
188 : unicode
189 : ? MessageTemplate::kInvalidUnicodeEscapeSequence
190 6812 : : MessageTemplate::kInvalidHexEscapeSequence);
191 : return -1;
192 : }
193 51142363 : x = x * 16 + d;
194 6428 : Advance<capture_raw>();
195 : }
196 :
197 : return x;
198 : }
199 :
200 : template <bool capture_raw>
201 217470 : uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
202 : uc32 x = 0;
203 44583 : int d = HexValue(c0_);
204 44583 : if (d < 0) return -1;
205 :
206 219573 : while (d >= 0) {
207 179776 : x = x * 16 + d;
208 179776 : if (x > max_value) {
209 : ReportScannerError(Location(beg_pos, source_pos() + 1),
210 : MessageTemplate::kUndefinedUnicodeCodePoint);
211 : return -1;
212 : }
213 6889 : Advance<capture_raw>();
214 6889 : d = HexValue(c0_);
215 : }
216 :
217 : return x;
218 : }
219 :
// Advances the scanner by one token: the former "next" token becomes the
// current one, and its kind is returned. A new token is only scanned when no
// look-ahead token (next_next_) is already available.
220 1160868529 : Token::Value Scanner::Next() {
221 : // Rotate through tokens.
222 386970098 : TokenDesc* previous = current_;
223 386970098 : current_ = next_;
224 : // Either we already have the next token lined up, in which case next_next_
225 : // simply becomes next_. In that case we use current_ as new next_next_ and
226 : // clear its token to indicate that it wasn't scanned yet. Otherwise we use
227 : // current_ as next_ and scan into it, leaving next_next_ uninitialized.
228 386970098 : if (V8_LIKELY(next_next().token == Token::UNINITIALIZED)) {
// Common case: no look-ahead token is pending, so recycle the old current
// descriptor as the new next_ and scan into it.
229 384370546 : next_ = previous;
230 : // Use 'previous' instead of 'next_' because for some reason the compiler
231 : // thinks 'next_' could be modified before the entry into Scan.
232 384370546 : previous->after_line_terminator = false;
233 : Scan(previous);
234 : } else {
// A look-ahead token was already scanned (see PeekAhead): promote it to
// next_ and recycle the old current descriptor as the empty look-ahead slot.
235 2599552 : next_ = next_next_;
236 2599552 : next_next_ = previous;
237 2599552 : previous->token = Token::UNINITIALIZED;
238 : DCHECK_NE(Token::UNINITIALIZED, current().token);
239 : }
240 386928333 : return current().token;
241 : }
242 :
243 2084657 : Token::Value Scanner::PeekAhead() {
244 : DCHECK(next().token != Token::DIV);
245 : DCHECK(next().token != Token::ASSIGN_DIV);
246 :
247 2084657 : if (next_next().token != Token::UNINITIALIZED) {
248 : return next_next().token;
249 : }
250 1913246 : TokenDesc* temp = next_;
251 1913246 : next_ = next_next_;
252 1913246 : next().after_line_terminator = false;
253 : Scan();
254 1913648 : next_next_ = next_;
255 1913648 : next_ = temp;
256 1913648 : return next_next().token;
257 : }
258 :
259 198 : Token::Value Scanner::SkipSingleHTMLComment() {
260 193 : if (is_module_) {
261 : ReportScannerError(source_pos(), MessageTemplate::kHtmlCommentInModule);
262 : return Token::ILLEGAL;
263 : }
264 188 : return SkipSingleLineComment();
265 : }
266 :
267 7119478 : Token::Value Scanner::SkipSingleLineComment() {
268 : // The line terminator at the end of the line is not considered
269 : // to be part of the single-line comment; it is recognized
270 : // separately by the lexical grammar and becomes part of the
271 : // stream of input elements for the syntactic grammar (see
272 : // ECMA-262, section 7.4).
273 364750189 : AdvanceUntil([](uc32 c0_) { return unibrow::IsLineTerminator(c0_); });
274 :
275 7124144 : return Token::WHITESPACE;
276 : }
277 :
278 4617 : Token::Value Scanner::SkipSourceURLComment() {
279 4005 : TryToParseSourceURLComment();
280 9553 : while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
281 : Advance();
282 : }
283 :
284 4005 : return Token::WHITESPACE;
285 : }
286 :
287 88557 : void Scanner::TryToParseSourceURLComment() {
288 : // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
289 : // function will just return if it cannot parse a magic comment.
290 : DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput));
291 8077 : if (!IsWhiteSpace(c0_)) return;
292 : Advance();
293 : LiteralBuffer name;
294 : name.Start();
295 :
296 122320 : while (c0_ != kEndOfInput && !IsWhiteSpaceOrLineTerminator(c0_) &&
297 40760 : c0_ != '=') {
298 36784 : name.AddChar(c0_);
299 : Advance();
300 : }
301 3996 : if (!name.is_one_byte()) return;
302 : Vector<const uint8_t> name_literal = name.one_byte_literal();
303 : LiteralBuffer* value;
304 3996 : if (name_literal == StaticCharVector("sourceURL")) {
305 3838 : value = &source_url_;
306 158 : } else if (name_literal == StaticCharVector("sourceMappingURL")) {
307 130 : value = &source_mapping_url_;
308 : } else {
309 : return;
310 : }
311 3968 : if (c0_ != '=')
312 : return;
313 : value->Start();
314 : Advance();
315 7956 : while (IsWhiteSpace(c0_)) {
316 : Advance();
317 : }
318 83755 : while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
319 : // Disallowed characters.
320 39804 : if (c0_ == '"' || c0_ == '\'') {
321 : value->Start();
322 : return;
323 : }
324 39784 : if (IsWhiteSpace(c0_)) {
325 : break;
326 : }
327 39744 : value->AddChar(c0_);
328 : Advance();
329 : }
330 : // Allow whitespace at the end.
331 4317 : while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
332 70 : if (!IsWhiteSpace(c0_)) {
333 : value->Start();
334 : break;
335 : }
336 : Advance();
337 : }
338 : }
339 :
340 6811485 : Token::Value Scanner::SkipMultiLineComment() {
341 : DCHECK_EQ(c0_, '*');
342 : Advance();
343 :
344 6566198 : while (c0_ != kEndOfInput) {
345 : DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
346 6632279 : if (!HasLineTerminatorBeforeNext() && unibrow::IsLineTerminator(c0_)) {
347 : // Following ECMA-262, section 7.4, a comment containing
348 : // a newline will make the comment count as a line-terminator.
349 462 : next().after_line_terminator = true;
350 : }
351 :
352 6696868 : while (V8_UNLIKELY(c0_ == '*')) {
353 : Advance();
354 187842 : if (c0_ == '/') {
355 : Advance();
356 57315 : return Token::WHITESPACE;
357 : }
358 : }
359 : Advance();
360 : }
361 :
362 : // Unterminated multi-line comment.
363 : return Token::ILLEGAL;
364 : }
365 :
366 25576 : void Scanner::SkipHashBang() {
367 25576 : if (c0_ == '#' && Peek() == '!' && source_pos() == 0) {
368 400 : SkipSingleLineComment();
369 : Scan();
370 : }
371 20776 : }
372 :
373 200 : Token::Value Scanner::ScanHtmlComment() {
374 : // Check for <!-- comments.
375 : DCHECK_EQ(c0_, '!');
376 : Advance();
377 128 : if (c0_ != '-' || Peek() != '-') {
378 : PushBack('!'); // undo Advance()
379 33 : return Token::LT;
380 : }
381 : Advance();
382 :
383 39 : found_html_comment_ = true;
384 39 : return SkipSingleHTMLComment();
385 : }
386 :
#ifdef DEBUG
// Debug-only check that |token| carries an invalid_template_escape_message
// only if its kind permits one.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  // Only TEMPLATE_* tokens can have an invalid_template_escape_message.
  // ILLEGAL and UNINITIALIZED can have garbage for the field
  // (token.literal_chars & other members might be garbage too; that's ok).
  const bool field_is_unconstrained = token.token == Token::UNINITIALIZED ||
                                      token.token == Token::ILLEGAL ||
                                      token.token == Token::TEMPLATE_SPAN ||
                                      token.token == Token::TEMPLATE_TAIL;
  if (field_is_unconstrained) return;

  DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
}
#endif  // DEBUG
405 :
406 241651 : void Scanner::SeekForward(int pos) {
407 : // After this call, we will have the token at the given position as
408 : // the "next" token. The "current" token will be invalid.
409 120838 : if (pos == next().location.beg_pos) return;
410 : int current_pos = source_pos();
411 : DCHECK_EQ(next().location.end_pos, current_pos);
412 : // Positions inside the lookahead token aren't supported.
413 : DCHECK(pos >= current_pos);
414 60414 : if (pos != current_pos) {
415 60409 : source_->Seek(pos);
416 : Advance();
417 : // This function is only called to seek to the location
418 : // of the end of a function (at the "}" token). It doesn't matter
419 : // whether there was a line terminator in the part we skip.
420 60409 : next().after_line_terminator = false;
421 : }
422 : Scan();
423 : }
424 :
// Scans one escape sequence; on entry c0_ is the character right after the
// backslash. On success the decoded character is appended to the current
// literal and true is returned. Returns false when a \u or \x escape is
// malformed (the callee reports a negative value).
// With capture_raw == false, an escaped line terminator is consumed without
// adding anything to the literal.
425 : template <bool capture_raw>
426 52215286 : bool Scanner::ScanEscape() {
427 26115142 : uc32 c = c0_;
428 15016 : Advance<capture_raw>();
429 :
430 : // Skip escaped newlines.
431 : DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
432 52207028 : if (!capture_raw && unibrow::IsLineTerminator(c)) {
433 : // Allow escaped CR+LF newlines in multiline string literals.
434 11052 : if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
435 : return true;
436 : }
437 :
// Map the escape character to the character it denotes. There is no default
// case: any character not listed keeps its own value.
438 26107477 : switch (c) {
439 : case '\'': // fall through
440 : case '"' : // fall through
441 : case '\\': break;
442 59 : case 'b' : c = '\b'; break;
443 107 : case 'f' : c = '\f'; break;
444 466753 : case 'n' : c = '\n'; break;
445 729 : case 'r' : c = '\r'; break;
446 401 : case 't' : c = '\t'; break;
447 : case 'u' : {
448 103822 : c = ScanUnicodeEscape<capture_raw>();
449 103821 : if (c < 0) return false;
450 : break;
451 : }
452 : case 'v':
453 : c = '\v';
454 47 : break;
455 : case 'x': {
// \xHH: exactly two hex digits.
456 25406247 : c = ScanHexNumber<capture_raw>(2);
457 25371301 : if (c < 0) return false;
458 : break;
459 : }
460 : case '0': // Fall through.
461 : case '1': // fall through
462 : case '2': // fall through
463 : case '3': // fall through
464 : case '4': // fall through
465 : case '5': // fall through
466 : case '6': // fall through
467 : case '7':
// Octal escape: the first digit is |c|; up to two more digits may follow.
468 4257 : c = ScanOctalEscape<capture_raw>(c, 2);
469 4257 : break;
470 : }
471 :
472 : // Other escaped characters are interpreted as their non-escaped version.
473 : AddLiteralChar(c);
474 : return true;
475 : }
476 :
477 : template <bool capture_raw>
478 9691 : uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
479 4257 : uc32 x = c - '0';
480 : int i = 0;
481 6758 : for (; i < length; i++) {
482 6142 : int d = c0_ - '0';
483 6142 : if (d < 0 || d > 7) break;
484 2519 : int nx = x * 8 + d;
485 2519 : if (nx >= 256) break;
486 : x = nx;
487 880 : Advance<capture_raw>();
488 : }
489 : // Anything except '\0' is an octal escape sequence, illegal in strict mode.
490 : // Remember the position of octal escape sequences so that an error
491 : // can be reported later (in strict mode).
492 : // We don't report the error immediately, because the octal escape can
493 : // occur before the "use strict" directive.
494 5293 : if (c != '0' || i > 0 || IsNonOctalDecimalDigit(c0_)) {
495 3813 : octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
496 3813 : octal_message_ = capture_raw ? MessageTemplate::kTemplateOctalLiteral
497 : : MessageTemplate::kStrictOctalEscape;
498 : }
499 4257 : return x;
500 : }
501 :
// Scans a string literal whose opening quote is at c0_, collecting its
// contents into the next token's literal buffer.
// Returns Token::STRING once the matching quote has been consumed, and
// Token::ILLEGAL on end of input, on a string-literal line terminator, or
// when an escape sequence fails to scan.
502 76871942 : Token::Value Scanner::ScanString() {
503 10352856 : uc32 quote = c0_;
504 : Advance(); // consume quote
505 :
506 : next().literal_chars.Start();
507 : while (true) {
508 36882084 : if (V8_UNLIKELY(c0_ == kEndOfInput)) return Token::ILLEGAL;
// Fast path: if the current character cannot end the string, add it and
// then bulk-consume following characters until one might.
// NOTE(review): this pre-check tests ">= kMaxAscii" while the lambda below
// tests "> kMaxAscii" — confirm the difference is intentional.
509 73777939 : if ((V8_UNLIKELY(static_cast<uint32_t>(c0_) >= kMaxAscii) &&
510 73749058 : !unibrow::IsStringLiteralLineTerminator(c0_)) ||
511 36866976 : !MayTerminateString(character_scan_flags[c0_])) {
512 : AddLiteralChar(c0_);
// The callback returns true at the character where scanning must stop; all
// other characters are appended to the literal.
513 151732621 : AdvanceUntil([this](uc32 c0) {
514 151732621 : if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
515 945730 : if (V8_UNLIKELY(unibrow::IsStringLiteralLineTerminator(c0))) {
516 : return true;
517 : }
518 945812 : AddLiteralChar(c0);
519 : return false;
520 : }
521 150786891 : uint8_t char_flags = character_scan_flags[c0];
522 150786891 : if (MayTerminateString(char_flags)) return true;
523 141433523 : AddLiteralChar(c0);
524 : return false;
525 : });
526 : }
// Slow path: decide what the potentially-terminating character actually is.
527 36882517 : if (c0_ == quote) {
528 : Advance();
529 10351652 : return Token::STRING;
530 : }
531 26530813 : if (c0_ == '\\') {
532 : Advance();
533 : // TODO(verwaest): Check whether we can remove the additional check.
534 26098392 : if (V8_UNLIKELY(c0_ == kEndOfInput || !ScanEscape<false>())) {
535 : return Token::ILLEGAL;
536 : }
537 : continue;
538 : }
539 860707 : if (V8_UNLIKELY(c0_ == kEndOfInput ||
540 : unibrow::IsStringLiteralLineTerminator(c0_))) {
541 : return Token::ILLEGAL;
542 : }
// What remains is the other kind of quote, which is ordinary content here.
543 : DCHECK_NE(quote, c0_);
544 : DCHECK((c0_ == '\'' || c0_ == '"'));
545 : AddLiteralCharAdvance();
546 : }
547 : }
548 :
549 656107 : Token::Value Scanner::ScanPrivateName() {
550 277959 : if (!allow_harmony_private_fields()) {
551 : ReportScannerError(source_pos(),
552 : MessageTemplate::kInvalidOrUnexpectedToken);
553 : return Token::ILLEGAL;
554 : }
555 :
556 : next().literal_chars.Start();
557 : DCHECK_EQ(c0_, '#');
558 : DCHECK(!IsIdentifierStart(kEndOfInput));
559 79709 : if (!IsIdentifierStart(Peek())) {
560 : ReportScannerError(source_pos(),
561 : MessageTemplate::kInvalidOrUnexpectedToken);
562 : return Token::ILLEGAL;
563 : }
564 :
565 : AddLiteralCharAdvance();
566 : Token::Value token = ScanIdentifierOrKeywordInner();
567 59229 : return token == Token::ILLEGAL ? Token::ILLEGAL : Token::PRIVATE_NAME;
568 : }
569 :
// Scans one span of a template literal into the next token, filling both the
// cooked (literal_chars) and raw (raw_literal_chars) buffers. Returns
// TEMPLATE_TAIL when the span ends at a backtick and TEMPLATE_SPAN otherwise
// (including when the input ends first).
570 2982916 : Token::Value Scanner::ScanTemplateSpan() {
571 : // When scanning a TemplateSpan, we are looking for the following construct:
572 : // TEMPLATE_SPAN ::
573 : // ` LiteralChars* ${
574 : // | } LiteralChars* ${
575 : //
576 : // TEMPLATE_TAIL ::
577 : // ` LiteralChars* `
578 : // | } LiteralChars* `
579 : //
580 : // A TEMPLATE_SPAN should always be followed by an Expression, while a
581 : // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
582 : // followed by an Expression.
583 :
584 : // These scoped helpers save and restore the original error state, so that we
585 : // can specially treat invalid escape sequences in templates (which are
586 : // handled by the parser).
587 154164 : ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_);
588 154164 : ErrorState octal_error_state(&octal_message_, &octal_pos_);
589 :
590 : Token::Value result = Token::TEMPLATE_SPAN;
591 : next().literal_chars.Start();
592 : next().raw_literal_chars.Start();
593 : const bool capture_raw = true;
594 : while (true) {
595 2041631 : uc32 c = c0_;
596 2041631 : if (c == '`') {
597 : Advance(); // Consume '`'
598 : result = Token::TEMPLATE_TAIL;
599 : break;
600 2060585 : } else if (c == '$' && Peek() == '{') {
601 : Advance(); // Consume '$'
602 : Advance(); // Consume '{'
603 : break;
604 1889199 : } else if (c == '\\') {
605 : Advance(); // Consume '\\'
606 : DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
607 : if (capture_raw) AddRawLiteralChar('\\');
608 30352 : if (unibrow::IsLineTerminator(c0_)) {
609 : // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
610 : // code unit sequence.
611 : uc32 lastChar = c0_;
612 : Advance();
613 160 : if (lastChar == '\r') {
614 : // Also skip \n.
615 92 : if (c0_ == '\n') Advance();
616 : lastChar = '\n';
617 : }
// Only the raw buffer records the (normalized) line terminator.
618 : if (capture_raw) AddRawLiteralChar(lastChar);
619 : } else {
620 15016 : bool success = ScanEscape<capture_raw>();
621 : USE(success);
622 : DCHECK_EQ(!success, has_error());
623 : // For templates, invalid escape sequence checking is handled in the
624 : // parser.
// Any error recorded by ScanEscape is moved onto the token being built.
625 15015 : scanner_error_state.MoveErrorTo(next_);
626 15015 : octal_error_state.MoveErrorTo(next_);
627 : }
628 1874023 : } else if (c < 0) {
629 : // Unterminated template literal
630 : break;
631 : } else {
632 : Advance(); // Consume c.
633 : // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
634 : // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
635 : // consisting of the CV 0x000A.
636 1872480 : if (c == '\r') {
637 614 : if (c0_ == '\n') Advance(); // Consume '\n'
638 : c = '\n';
639 : }
640 : if (capture_raw) AddRawLiteralChar(c);
641 : AddLiteralChar(c);
642 : }
643 : }
// Finalize the token that was scanned into next().
644 154203 : next().location.end_pos = source_pos();
645 154203 : next().token = result;
646 :
647 154203 : return result;
648 : }
649 :
650 1718309 : Handle<String> Scanner::SourceUrl(Isolate* isolate) const {
651 : Handle<String> tmp;
652 1718309 : if (source_url_.length() > 0) {
653 3803 : tmp = source_url_.Internalize(isolate);
654 : }
655 1718309 : return tmp;
656 : }
657 :
658 1718312 : Handle<String> Scanner::SourceMappingUrl(Isolate* isolate) const {
659 : Handle<String> tmp;
660 1718312 : if (source_mapping_url_.length() > 0) {
661 100 : tmp = source_mapping_url_.Internalize(isolate);
662 : }
663 1718312 : return tmp;
664 : }
665 :
666 2697 : bool Scanner::ScanDigitsWithNumericSeparators(bool (*predicate)(uc32 ch),
667 2894 : bool is_check_first_digit) {
668 : // we must have at least one digit after 'x'/'b'/'o'
669 2697 : if (is_check_first_digit && !predicate(c0_)) return false;
670 :
671 : bool separator_seen = false;
672 7863 : while (predicate(c0_) || c0_ == '_') {
673 5780 : if (c0_ == '_') {
674 : Advance();
675 2200 : if (c0_ == '_') {
676 : ReportScannerError(Location(source_pos(), source_pos() + 1),
677 : MessageTemplate::kContinuousNumericSeparator);
678 : return false;
679 : }
680 : separator_seen = true;
681 : continue;
682 : }
683 : separator_seen = false;
684 : AddLiteralCharAdvance();
685 : }
686 :
687 2083 : if (separator_seen) {
688 : ReportScannerError(Location(source_pos(), source_pos() + 1),
689 : MessageTemplate::kTrailingNumericSeparator);
690 : return false;
691 : }
692 :
693 : return true;
694 : }
695 :
696 2197785 : bool Scanner::ScanDecimalDigits() {
697 2197785 : if (allow_harmony_numeric_separator()) {
698 1620 : return ScanDigitsWithNumericSeparators(&IsDecimalDigit, false);
699 : }
700 7520398 : while (IsDecimalDigit(c0_)) {
701 : AddLiteralCharAdvance();
702 : }
703 : return true;
704 : }
705 :
706 59519 : bool Scanner::ScanDecimalAsSmiWithNumericSeparators(uint64_t* value) {
707 : bool separator_seen = false;
708 153047 : while (IsDecimalDigit(c0_) || c0_ == '_') {
709 24535 : if (c0_ == '_') {
710 : Advance();
711 1068 : if (c0_ == '_') {
712 : ReportScannerError(Location(source_pos(), source_pos() + 1),
713 : MessageTemplate::kContinuousNumericSeparator);
714 : return false;
715 : }
716 : separator_seen = true;
717 : continue;
718 : }
719 : separator_seen = false;
720 23468 : *value = 10 * *value + (c0_ - '0');
721 23468 : uc32 first_char = c0_;
722 : Advance();
723 : AddLiteralChar(first_char);
724 : }
725 :
726 34630 : if (separator_seen) {
727 : ReportScannerError(Location(source_pos(), source_pos() + 1),
728 : MessageTemplate::kTrailingNumericSeparator);
729 : return false;
730 : }
731 :
732 : return true;
733 : }
734 :
735 85566617 : bool Scanner::ScanDecimalAsSmi(uint64_t* value) {
736 32409935 : if (allow_harmony_numeric_separator()) {
737 34717 : return ScanDecimalAsSmiWithNumericSeparators(value);
738 : }
739 :
740 171064020 : while (IsDecimalDigit(c0_)) {
741 53156682 : *value = 10 * *value + (c0_ - '0');
742 53156682 : uc32 first_char = c0_;
743 : Advance();
744 : AddLiteralChar(first_char);
745 : }
746 : return true;
747 : }
748 :
749 757 : bool Scanner::ScanBinaryDigits() {
750 757 : if (allow_harmony_numeric_separator()) {
751 359 : return ScanDigitsWithNumericSeparators(&IsBinaryDigit, true);
752 : }
753 :
754 : // we must have at least one binary digit after 'b'/'B'
755 796 : if (!IsBinaryDigit(c0_)) {
756 : return false;
757 : }
758 :
759 1596 : while (IsBinaryDigit(c0_)) {
760 : AddLiteralCharAdvance();
761 : }
762 : return true;
763 : }
764 :
765 768 : bool Scanner::ScanOctalDigits() {
766 768 : if (allow_harmony_numeric_separator()) {
767 359 : return ScanDigitsWithNumericSeparators(&IsOctalDigit, true);
768 : }
769 :
770 : // we must have at least one octal digit after 'o'/'O'
771 818 : if (!IsOctalDigit(c0_)) {
772 : return false;
773 : }
774 :
775 1752 : while (IsOctalDigit(c0_)) {
776 : AddLiteralCharAdvance();
777 : }
778 : return true;
779 : }
780 :
781 199945 : bool Scanner::ScanImplicitOctalDigits(int start_pos,
782 199940 : Scanner::NumberKind* kind) {
783 199945 : *kind = IMPLICIT_OCTAL;
784 :
785 : while (true) {
786 : // (possible) octal number
787 808026 : if (IsNonOctalDecimalDigit(c0_)) {
788 6 : *kind = DECIMAL_WITH_LEADING_ZERO;
789 6 : return true;
790 : }
791 404007 : if (!IsOctalDigit(c0_)) {
792 : // Octal literal finished.
793 199940 : octal_pos_ = Location(start_pos, source_pos());
794 199940 : octal_message_ = MessageTemplate::kStrictOctalLiteral;
795 199940 : return true;
796 : }
797 : AddLiteralCharAdvance();
798 : }
799 : }
800 :
801 476085 : bool Scanner::ScanHexDigits() {
802 476085 : if (allow_harmony_numeric_separator()) {
803 359 : return ScanDigitsWithNumericSeparators(&IsHexDigit, true);
804 : }
805 :
806 : // we must have at least one hex digit after 'x'/'X'
807 951452 : if (!IsHexDigit(c0_)) {
808 : return false;
809 : }
810 :
811 3362554 : while (IsHexDigit(c0_)) {
812 : AddLiteralCharAdvance();
813 : }
814 : return true;
815 : }
816 :
817 15582 : bool Scanner::ScanSignedInteger() {
818 15582 : if (c0_ == '+' || c0_ == '-') AddLiteralCharAdvance();
819 : // we must have at least one decimal digit after 'e'/'E'
820 31162 : if (!IsDecimalDigit(c0_)) return false;
821 15207 : return ScanDecimalDigits();
822 : }
823 :
// Scans a numeric literal into the next token's literal buffer.
// |seen_period| is true when the caller has already consumed a leading '.',
// so that c0_ is the first digit of the fraction.
// Returns Token::SMI (with next().smi_value_ set) for short decimal integers
// that fit the Smi range, Token::BIGINT for literals with an 'n' suffix,
// Token::NUMBER for all other valid literals, and Token::ILLEGAL on
// malformed input.
824 174915599 : Token::Value Scanner::ScanNumber(bool seen_period) {
825 : DCHECK(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
826 :
827 33091206 : NumberKind kind = DECIMAL;
828 :
829 : next().literal_chars.Start();
// |at_start| gates the Smi fast path below; it is cleared when digits other
// than a single leading zero have already been consumed.
830 33091206 : bool at_start = !seen_period;
831 : int start_pos = source_pos(); // For reporting octal positions.
832 33091206 : if (seen_period) {
833 : // we have already seen a decimal point of the float
834 : AddLiteralChar('.');
835 3066 : if (allow_harmony_numeric_separator() && c0_ == '_') {
836 : return Token::ILLEGAL;
837 : }
838 : // we know we have at least one digit
839 3066 : if (!ScanDecimalDigits()) return Token::ILLEGAL;
840 : } else {
841 : // if the first character is '0' we must check for octals and hex
842 33088140 : if (c0_ == '0') {
843 : AddLiteralCharAdvance();
844 :
845 : // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
846 : // an octal number.
847 11610335 : if (c0_ == 'x' || c0_ == 'X') {
848 : AddLiteralCharAdvance();
849 476096 : kind = HEX;
850 476096 : if (!ScanHexDigits()) return Token::ILLEGAL;
851 11134148 : } else if (c0_ == 'o' || c0_ == 'O') {
852 : AddLiteralCharAdvance();
853 768 : kind = OCTAL;
854 768 : if (!ScanOctalDigits()) return Token::ILLEGAL;
855 11133380 : } else if (c0_ == 'b' || c0_ == 'B') {
856 : AddLiteralCharAdvance();
857 757 : kind = BINARY;
858 757 : if (!ScanBinaryDigits()) return Token::ILLEGAL;
859 11132623 : } else if (IsOctalDigit(c0_)) {
860 199945 : kind = IMPLICIT_OCTAL;
861 199945 : if (!ScanImplicitOctalDigits(start_pos, &kind)) {
862 : return Token::ILLEGAL;
863 : }
864 199946 : if (kind == DECIMAL_WITH_LEADING_ZERO) {
865 : at_start = false;
866 : }
867 10932678 : } else if (IsNonOctalDecimalDigit(c0_)) {
868 48816 : kind = DECIMAL_WITH_LEADING_ZERO;
869 10883862 : } else if (allow_harmony_numeric_separator() && c0_ == '_') {
// A separator directly after a leading zero ("0_") is rejected.
870 : ReportScannerError(Location(source_pos(), source_pos() + 1),
871 : MessageTemplate::kZeroDigitNumericSeparator);
872 : return Token::ILLEGAL;
873 : }
874 : }
875 :
876 : // Parse decimal digits and allow trailing fractional part.
877 33086319 : if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) {
878 : // This is an optimization for parsing Decimal numbers as Smi's.
879 32409825 : if (at_start) {
880 32409906 : uint64_t value = 0;
881 : // scan subsequent decimal digits
882 32409906 : if (!ScanDecimalAsSmi(&value)) {
883 31256587 : return Token::ILLEGAL;
884 : }
885 :
// Take the Smi result only if the literal has at most 10 characters, the
// value fits the Smi range, and no fraction or identifier character follows.
886 64789712 : if (next().literal_chars.one_byte_literal().length() <= 10 &&
887 95969218 : value <= Smi::kMaxValue && c0_ != '.' && !IsIdentifierStart(c0_)) {
888 31256227 : next().smi_value_ = static_cast<uint32_t>(value);
889 :
890 31256227 : if (kind == DECIMAL_WITH_LEADING_ZERO) {
891 48816 : octal_pos_ = Location(start_pos, source_pos());
892 48816 : octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
893 : }
894 : return Token::SMI;
895 : }
896 : }
897 :
// Not a Smi: continue with any remaining integer digits and an optional
// fractional part.
898 1153379 : if (!ScanDecimalDigits()) return Token::ILLEGAL;
899 1153451 : if (c0_ == '.') {
900 : seen_period = true;
901 : AddLiteralCharAdvance();
902 1026165 : if (allow_harmony_numeric_separator() && c0_ == '_') {
903 : return Token::ILLEGAL;
904 : }
905 1026081 : if (!ScanDecimalDigits()) return Token::ILLEGAL;
906 : }
907 : }
908 : }
909 :
910 : bool is_bigint = false;
// An 'n' suffix makes a BigInt; it is only accepted on literals without a
// period and of kind DECIMAL, HEX, OCTAL or BINARY.
911 3679267 : if (c0_ == 'n' && !seen_period &&
912 13751 : (kind == DECIMAL || kind == HEX || kind == OCTAL || kind == BINARY)) {
913 : // Check that the literal is within our limits for BigInt length.
914 : // For simplicity, use 4 bits per character to calculate the maximum
915 : // allowed literal length.
916 : static const int kMaxBigIntCharacters = BigInt::kMaxLengthBits / 4;
// The two-character radix prefix (e.g. "0x") does not count toward the length.
917 13709 : int length = source_pos() - start_pos - (kind != DECIMAL ? 2 : 0);
918 13709 : if (length > kMaxBigIntCharacters) {
919 : ReportScannerError(Location(start_pos, source_pos()),
920 : MessageTemplate::kBigIntTooBig);
921 : return Token::ILLEGAL;
922 : }
923 :
924 : is_bigint = true;
925 : Advance();
926 1819061 : } else if (c0_ == 'e' || c0_ == 'E') {
927 : // scan exponent, if any
928 : DCHECK(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
929 :
930 15583 : if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO))
931 : return Token::ILLEGAL;
932 :
933 : // scan exponent
934 : AddLiteralCharAdvance();
935 :
936 15582 : if (!ScanSignedInteger()) return Token::ILLEGAL;
937 : }
938 :
939 : // The source character immediately following a numeric literal must
940 : // not be an identifier start or a decimal digit; see ECMA-262
941 : // section 7.8.3, page 17 (note that we read only one decimal digit
942 : // if the value is 0).
943 5497158 : if (IsDecimalDigit(c0_) || IsIdentifierStart(c0_)) {
944 : return Token::ILLEGAL;
945 : }
946 :
// Remember where a decimal-with-leading-zero literal occurred, alongside the
// message to use for it.
947 1828717 : if (kind == DECIMAL_WITH_LEADING_ZERO) {
948 6 : octal_pos_ = Location(start_pos, source_pos());
949 6 : octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
950 : }
951 :
952 1828717 : return is_bigint ? Token::BIGINT : Token::NUMBER;
953 : }
954 :
// Scans a unicode escape that occurs inside an identifier. The caller visible
// in this file (ScanIdentifierOrKeywordInnerSlow) invokes this while c0_ is
// the backslash, so the first Advance() skips it. The next character must be
// u; if it is not, -1 is returned. Otherwise the u is skipped and the result
// of ScanUnicodeEscape<false>() is returned unchanged.
955 68215 : uc32 Scanner::ScanIdentifierUnicodeEscape() {
956 : Advance();
957 37812 : if (c0_ != 'u') return -1;
958 : Advance();
959 30403 : return ScanUnicodeEscape<false>();
960 : }
961 :
// Scans the part of a unicode escape that follows the backslash and the u.
// capture_raw is forwarded to Advance<> and to the hex-number helpers.
// Braced form: returns the scanned code point, or -1 after reporting
// kInvalidUnicodeEscapeSequence when the helper yields a negative value or
// the closing brace is missing.
// Unbraced form: returns the result of ScanHexNumber for 4 digits.
962 : template <bool capture_raw>
963 144728 : uc32 Scanner::ScanUnicodeEscape() {
964 : // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
965 : // hex digits between { } is arbitrary. \ and u have already been read.
966 134227 : if (c0_ == '{') {
// begin is two positions before the opening brace; presumably the position
// of the backslash, since the backslash and the u were already read
// (TODO confirm how ScanUnlimitedLengthHexNumber uses it).
967 44586 : int begin = source_pos() - 2;
968 5629 : Advance<capture_raw>();
// 0x10FFFF is passed as the first argument; presumably the largest accepted
// value (the highest Unicode code point) -- confirm against the helper.
969 44583 : uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10FFFF, begin);
970 44585 : if (cp < 0 || c0_ != '}') {
971 : ReportScannerError(source_pos(),
972 : MessageTemplate::kInvalidUnicodeEscapeSequence);
973 : return -1;
974 : }
// Consume the closing brace.
975 757 : Advance<capture_raw>();
976 38181 : return cp;
977 : }
978 : const bool unicode = true;
979 89641 : return ScanHexNumber<capture_raw, unicode>(4);
980 : }
981 :
// Slow-path loop that scans the rest of an identifier or keyword.
// Each iteration handles one of three cases:
//  - c0_ is a backslash: sets escaped, decodes the escape with
//    ScanIdentifierUnicodeEscape(), and returns Token::ILLEGAL unless the
//    decoded character is an identifier part other than a backslash;
//    otherwise the decoded character is added to the literal.
//  - IsIdentifierPart(c0_) holds, directly or after CombineSurrogatePair():
//    the character is added to the literal and the scanner advances.
//  - anything else: the loop ends.
// can_be_keyword becomes false once any character fails CharCanBeKeyword().
// After the loop, if can_be_keyword still holds and the literal of next() is
// one-byte, KeywordOrIdentifierToken() picks the token and escaped keywords
// are remapped (see below). In every other case Token::IDENTIFIER is returned.
982 16426 : Token::Value Scanner::ScanIdentifierOrKeywordInnerSlow(bool escaped,
983 29390 : bool can_be_keyword) {
984 : while (true) {
985 71374 : if (c0_ == '\\') {
986 : escaped = true;
987 13201 : uc32 c = ScanIdentifierUnicodeEscape();
988 : // Only allow legal identifier part characters.
989 : // TODO(verwaest): Make this true.
990 : // DCHECK(!IsIdentifierPart('\'));
991 : DCHECK(!IsIdentifierPart(-1));
// A failed escape yields -1, which the DCHECK above asserts is not an
// identifier part, so it is rejected by the same test as other bad characters.
992 26400 : if (c == '\\' || !IsIdentifierPart(c)) {
993 : return Token::ILLEGAL;
994 : }
995 25163 : can_be_keyword = can_be_keyword && CharCanBeKeyword(c);
996 : AddLiteralChar(c);
997 132248 : } else if (IsIdentifierPart(c0_) ||
998 16046 : (CombineSurrogatePair() && IsIdentifierPart(c0_))) {
999 83448 : can_be_keyword = can_be_keyword && CharCanBeKeyword(c0_);
1000 : AddLiteralCharAdvance();
1001 : } else {
1002 : break;
1003 : }
1004 : }
1005 :
1006 30543 : if (can_be_keyword && next().literal_chars.is_one_byte()) {
1007 : Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
1008 : Token::Value token =
1009 : KeywordOrIdentifierToken(chars.start(), chars.length());
// Tokens that IsInRange places between IDENTIFIER and YIELD are returned
// as is, whether or not an escape was seen.
1010 14695 : if (IsInRange(token, Token::IDENTIFIER, Token::YIELD)) return token;
1011 :
1012 12792 : if (token == Token::FUTURE_STRICT_RESERVED_WORD) {
1013 2280 : if (escaped) return Token::ESCAPED_STRICT_RESERVED_WORD;
1014 0 : return token;
1015 : }
1016 :
1017 10512 : if (!escaped) return token;
1018 :
// From here on the keyword was written with an escape: LET and STATIC map
// to ESCAPED_STRICT_RESERVED_WORD, every other keyword to ESCAPED_KEYWORD.
1019 : STATIC_ASSERT(Token::LET + 1 == Token::STATIC);
1020 10512 : if (IsInRange(token, Token::LET, Token::STATIC)) {
1021 : return Token::ESCAPED_STRICT_RESERVED_WORD;
1022 : }
1023 7260 : return Token::ESCAPED_KEYWORD;
1024 : }
1025 :
1026 : return Token::IDENTIFIER;
1027 : }
1028 :
// Scans the body of a regular expression literal, up to and including the
// closing slash, into the literal of next().
// Preconditions (DCHECKed): next_next() is uninitialized and next().token is
// DIV or ASSIGN_DIV. For ASSIGN_DIV an equals sign is added to the literal
// first (presumably because it is the first character of the pattern body).
// Returns false if end of input or a line terminator is reached before the
// closing slash, including directly after a backslash. On success sets
// next().token to REGEXP_LITERAL and returns true.
1029 317059 : bool Scanner::ScanRegExpPattern() {
1030 : DCHECK_EQ(Token::UNINITIALIZED, next_next().token);
1031 : DCHECK(next().token == Token::DIV || next().token == Token::ASSIGN_DIV);
1032 :
1033 : // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1034 : bool in_character_class = false;
1035 :
1036 : // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1037 : // the scanner should pass uninterpreted bodies to the RegExp
1038 : // constructor.
1039 : next().literal_chars.Start();
1040 79375 : if (next().token == Token::ASSIGN_DIV) {
1041 : AddLiteralChar('=');
1042 : }
1043 :
// A slash only terminates the pattern outside of a character class.
1044 979574 : while (c0_ != '/' || in_character_class) {
1045 1800549 : if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
1046 : return false;
1047 : }
1048 900164 : if (c0_ == '\\') { // Escape sequence.
1049 : AddLiteralCharAdvance();
1050 184875 : if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
1051 : return false;
1052 : }
1053 : AddLiteralCharAdvance();
1054 : // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1055 : // only "safe" characters are allowed (letters, digits, underscore),
1056 : // otherwise the escape isn't valid and the invalid character has
1057 : // its normal meaning. I.e., we can just continue scanning without
1058 : // worrying whether the following characters are part of the escape
1059 : // or not, since any '/', '\\' or '[' is guaranteed to not be part
1060 : // of the escape sequence.
1061 :
1062 : // TODO(896): At some point, parse RegExps more thoroughly to capture
1063 : // octal escapes in strict mode.
1064 : } else { // Unescaped character.
1065 807715 : if (c0_ == '[') in_character_class = true;
1066 807715 : if (c0_ == ']') in_character_class = false;
1067 : AddLiteralCharAdvance();
1068 : }
1069 : }
1070 : Advance(); // consume '/'
1071 :
1072 79155 : next().token = Token::REGEXP_LITERAL;
1073 79155 : return true;
1074 : }
1075 :
1076 :
// Scans the flags of a regular expression literal; next().token must be
// REGEXP_LITERAL (DCHECKed). Characters are consumed while
// IsIdentifierPart(c0_) holds. The accepted flag characters are g, i, m, s,
// u and y. Returns Nothing when a character is not one of those or when a
// flag repeats; in both cases the offending character is not consumed.
// On success sets next().location.end_pos to the current source position and
// returns the combined flags.
1077 192165 : Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
1078 : DCHECK_EQ(Token::REGEXP_LITERAL, next().token);
1079 :
1080 : // Scan regular expression flags.
1081 : int flags = 0;
1082 306504 : while (IsIdentifierPart(c0_)) {
1083 : RegExp::Flags flag = RegExp::kNone;
1084 35297 : switch (c0_) {
1085 : case 'g':
1086 : flag = RegExp::kGlobal;
1087 : break;
1088 : case 'i':
1089 : flag = RegExp::kIgnoreCase;
1090 4518 : break;
1091 : case 'm':
1092 : flag = RegExp::kMultiline;
1093 590 : break;
1094 : case 's':
1095 : flag = RegExp::kDotAll;
1096 156 : break;
1097 : case 'u':
1098 : flag = RegExp::kUnicode;
1099 7286 : break;
1100 : case 'y':
1101 : flag = RegExp::kSticky;
1102 123 : break;
1103 : default:
1104 : return Nothing<RegExp::Flags>();
1105 : }
// The bit is already set, so this flag was seen before.
1106 34812 : if (flags & flag) {
1107 : return Nothing<RegExp::Flags>();
1108 : }
1109 : Advance();
1110 34688 : flags |= flag;
1111 : }
1112 :
1113 78439 : next().location.end_pos = source_pos();
1114 78439 : return Just(RegExp::Flags(flags));
1115 : }
1116 :
// Returns the AstRawString for the current literal, obtained from
// ast_value_factory via GetOneByteString() when is_literal_one_byte() holds
// and via GetTwoByteString() otherwise.
1117 99715910 : const AstRawString* Scanner::CurrentSymbol(
1118 : AstValueFactory* ast_value_factory) const {
1119 99715910 : if (is_literal_one_byte()) {
1120 99677852 : return ast_value_factory->GetOneByteString(literal_one_byte_string());
1121 : }
1122 74130 : return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1123 : }
1124 :
// Returns the AstRawString for the literal of the next token, obtained from
// ast_value_factory via GetOneByteString() when is_next_literal_one_byte()
// holds and via GetTwoByteString() otherwise.
1125 553164 : const AstRawString* Scanner::NextSymbol(
1126 : AstValueFactory* ast_value_factory) const {
1127 553164 : if (is_next_literal_one_byte()) {
1128 551044 : return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1129 : }
1130 2138 : return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1131 : }
1132 :
// Returns the AstRawString for the current raw literal, obtained from
// ast_value_factory via GetOneByteString() when is_raw_literal_one_byte()
// holds and via GetTwoByteString() otherwise.
1133 82293 : const AstRawString* Scanner::CurrentRawSymbol(
1134 : AstValueFactory* ast_value_factory) const {
1135 82293 : if (is_raw_literal_one_byte()) {
1136 82263 : return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1137 : }
1138 33 : return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1139 : }
1140 :
1141 :
// Converts the current literal to a double with StringToDouble(), passing
// the ALLOW_HEX, ALLOW_OCTAL, ALLOW_IMPLICIT_OCTAL and ALLOW_BINARY flags.
// The literal must be one-byte (DCHECKed).
1142 1250767 : double Scanner::DoubleValue() {
1143 : DCHECK(is_literal_one_byte());
1144 : return StringToDouble(
1145 : literal_one_byte_string(),
1146 1250767 : ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1147 : }
1148 :
// Returns a NUL-terminated copy of the current one-byte literal. The buffer
// of length + 1 chars is obtained from zone->NewArray<char>(). The literal
// must be one-byte (DCHECKed).
// NOTE(review): the bytes are copied verbatim, so a literal containing a NUL
// byte would presumably appear truncated to C-string consumers -- confirm
// whether callers can pass such literals.
1149 12251 : const char* Scanner::CurrentLiteralAsCString(Zone* zone) const {
1150 : DCHECK(is_literal_one_byte());
1151 : Vector<const uint8_t> vector = literal_one_byte_string();
1152 : int length = vector.length();
1153 12251 : char* buffer = zone->NewArray<char>(length + 1);
1154 : memcpy(buffer, vector.start(), length);
1155 12251 : buffer[length] = '\0';
1156 12251 : return buffer;
1157 : }
1158 :
// Repositions the scanner at the given source position and scans one token
// from there. The final DCHECK asserts that next() then begins at position.
// Every entry of token_storage_ is marked UNINITIALIZED and has its
// invalid_template_escape_message cleared before the source is moved.
1159 96367 : void Scanner::SeekNext(size_t position) {
1160 : // Use with care: This cleanly resets most, but not all scanner state.
1161 : // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.
1162 :
1163 : // To re-scan from a given character position, we need to:
1164 : // 1, Reset the current_, next_ and next_next_ tokens
1165 : // (next_ + next_next_ will be overwritten by Next(),
1166 : // current_ will remain unchanged, so overwrite it fully.)
1167 192730 : for (TokenDesc& token : token_storage_) {
1168 144546 : token.token = Token::UNINITIALIZED;
1169 144546 : token.invalid_template_escape_message = MessageTemplate::kNone;
1170 : }
1171 : // 2, reset the source to the desired position,
1172 48184 : source_->Seek(position);
1173 : // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
1174 96366 : c0_ = source_->Advance();
1175 48183 : next().after_line_terminator = false;
1176 : Scan();
1177 : DCHECK_EQ(next().location.beg_pos, static_cast<int>(position));
1178 48182 : }
1179 :
1180 : } // namespace internal
1181 178779 : } // namespace v8
|