Line data Source code
1 : // Copyright 2011 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : // Features shared by parsing and pre-parsing scanners.
6 :
7 : #include "src/parsing/scanner.h"
8 :
9 : #include <stdint.h>
10 :
11 : #include <cmath>
12 :
13 : #include "src/ast/ast-value-factory.h"
14 : #include "src/conversions-inl.h"
15 : #include "src/objects/bigint.h"
16 : #include "src/parsing/scanner-inl.h"
17 : #include "src/zone/zone.h"
18 :
19 : namespace v8 {
20 : namespace internal {
21 :
22 : class Scanner::ErrorState {
23 : public:
24 : ErrorState(MessageTemplate* message_stack, Scanner::Location* location_stack)
25 : : message_stack_(message_stack),
26 : old_message_(*message_stack),
27 : location_stack_(location_stack),
28 314160 : old_location_(*location_stack) {
29 314160 : *message_stack_ = MessageTemplate::kNone;
30 314160 : *location_stack_ = Location::invalid();
31 : }
32 :
33 : ~ErrorState() {
34 314162 : *message_stack_ = old_message_;
35 314162 : *location_stack_ = old_location_;
36 : }
37 :
38 : void MoveErrorTo(TokenDesc* dest) {
39 30012 : if (*message_stack_ == MessageTemplate::kNone) {
40 : return;
41 : }
42 13078 : if (dest->invalid_template_escape_message == MessageTemplate::kNone) {
43 13078 : dest->invalid_template_escape_message = *message_stack_;
44 13078 : dest->invalid_template_escape_location = *location_stack_;
45 : }
46 13078 : *message_stack_ = MessageTemplate::kNone;
47 13078 : *location_stack_ = Location::invalid();
48 : }
49 :
50 : private:
51 : MessageTemplate* const message_stack_;
52 : MessageTemplate const old_message_;
53 : Scanner::Location* const location_stack_;
54 : Scanner::Location const old_location_;
55 : };
56 :
57 : // ----------------------------------------------------------------------------
58 : // Scanner::LiteralBuffer
59 :
60 3927 : Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const {
61 3927 : if (is_one_byte()) {
62 3927 : return isolate->factory()->InternalizeOneByteString(one_byte_literal());
63 : }
64 0 : return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
65 : }
66 :
67 0 : int Scanner::LiteralBuffer::NewCapacity(int min_capacity) {
68 : return min_capacity < (kMaxGrowth / (kGrowthFactor - 1))
69 : ? min_capacity * kGrowthFactor
70 5813362 : : min_capacity + kMaxGrowth;
71 : }
72 :
73 5810855 : void Scanner::LiteralBuffer::ExpandBuffer() {
74 : int min_capacity = Max(kInitialCapacity, backing_store_.length());
75 : Vector<byte> new_store = Vector<byte>::New(NewCapacity(min_capacity));
76 5810935 : if (position_ > 0) {
77 145151 : MemCopy(new_store.start(), backing_store_.start(), position_);
78 : }
79 : backing_store_.Dispose();
80 5810936 : backing_store_ = new_store;
81 5810936 : }
82 :
83 78150 : void Scanner::LiteralBuffer::ConvertToTwoByte() {
84 : DCHECK(is_one_byte());
85 : Vector<byte> new_store;
86 78150 : int new_content_size = position_ * kUC16Size;
87 78150 : if (new_content_size >= backing_store_.length()) {
88 : // Ensure room for all currently read code units as UC16 as well
89 : // as the code unit about to be stored.
90 : new_store = Vector<byte>::New(NewCapacity(new_content_size));
91 : } else {
92 75643 : new_store = backing_store_;
93 : }
94 : uint8_t* src = backing_store_.start();
95 : uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
96 194987 : for (int i = position_ - 1; i >= 0; i--) {
97 116834 : dst[i] = src[i];
98 : }
99 78153 : if (new_store.start() != backing_store_.start()) {
100 : backing_store_.Dispose();
101 2508 : backing_store_ = new_store;
102 : }
103 78153 : position_ = new_content_size;
104 78153 : is_one_byte_ = false;
105 78153 : }
106 :
107 8797984 : void Scanner::LiteralBuffer::AddTwoByteChar(uc32 code_unit) {
108 : DCHECK(!is_one_byte());
109 17595968 : if (position_ >= backing_store_.length()) ExpandBuffer();
110 8798166 : if (code_unit <=
111 : static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
112 17537746 : *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
113 8768873 : position_ += kUC16Size;
114 : } else {
115 29293 : *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
116 58586 : unibrow::Utf16::LeadSurrogate(code_unit);
117 29293 : position_ += kUC16Size;
118 29293 : if (position_ >= backing_store_.length()) ExpandBuffer();
119 29293 : *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
120 29293 : unibrow::Utf16::TrailSurrogate(code_unit);
121 29293 : position_ += kUC16Size;
122 : }
123 8798166 : }
124 :
125 : // ----------------------------------------------------------------------------
126 : // Scanner::BookmarkScope
127 :
128 : const size_t Scanner::BookmarkScope::kNoBookmark =
129 : std::numeric_limits<size_t>::max() - 1;
130 : const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
131 : std::numeric_limits<size_t>::max();
132 :
133 2564943 : void Scanner::BookmarkScope::Set(size_t position) {
134 : DCHECK_EQ(bookmark_, kNoBookmark);
135 2564943 : bookmark_ = position;
136 2564943 : }
137 :
138 49124 : void Scanner::BookmarkScope::Apply() {
139 : DCHECK(HasBeenSet()); // Caller hasn't called SetBookmark.
140 49124 : if (had_parser_error_) {
141 0 : scanner_->set_parser_error();
142 : } else {
143 49124 : scanner_->reset_parser_error_flag();
144 49124 : scanner_->SeekNext(bookmark_);
145 : }
146 49124 : bookmark_ = kBookmarkWasApplied;
147 49124 : }
148 :
149 0 : bool Scanner::BookmarkScope::HasBeenSet() const {
150 0 : return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied;
151 : }
152 :
153 0 : bool Scanner::BookmarkScope::HasBeenApplied() const {
154 0 : return bookmark_ == kBookmarkWasApplied;
155 : }
156 :
157 : // ----------------------------------------------------------------------------
158 : // Scanner
159 :
// Constructs a scanner reading from |source| (must be non-null; checked in
// debug builds). |is_module| is stored in is_module_. The HTML-comment and
// numeric-separator flags start out false, and the octal error slot starts
// empty (invalid location, MessageTemplate::kNone).
160 2991665 : Scanner::Scanner(Utf16CharacterStream* source, bool is_module)
161 : : source_(source),
162 : found_html_comment_(false),
163 : allow_harmony_numeric_separator_(false),
164 : is_module_(is_module),
165 : octal_pos_(Location::invalid()),
166 14958235 : octal_message_(MessageTemplate::kNone) {
167 : DCHECK_NOT_NULL(source);
168 2991665 : }
169 :
170 2991664 : void Scanner::Initialize() {
171 : // Need to capture identifiers in order to recognize "get" and "set"
172 : // in object literals.
173 2991664 : Init();
174 2991669 : next().after_line_terminator = true;
175 : Scan();
176 2991667 : }
177 :
178 : template <bool capture_raw, bool unicode>
179 25145502 : uc32 Scanner::ScanHexNumber(int expected_length) {
180 : DCHECK_LE(expected_length, 4); // prevent overflow
181 :
182 25145502 : int begin = source_pos() - 2;
183 : uc32 x = 0;
184 125726546 : for (int i = 0; i < expected_length; i++) {
185 50458786 : int d = HexValue(c0_);
186 50458786 : if (d < 0) {
187 6989 : ReportScannerError(Location(begin, begin + expected_length + 2),
188 : unicode
189 : ? MessageTemplate::kInvalidUnicodeEscapeSequence
190 : : MessageTemplate::kInvalidHexEscapeSequence);
191 : return -1;
192 : }
193 50451797 : x = x * 16 + d;
194 6428 : Advance<capture_raw>();
195 : }
196 :
197 : return x;
198 : }
199 :
200 : template <bool capture_raw>
201 47283 : uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
202 : uc32 x = 0;
203 47283 : int d = HexValue(c0_);
204 47283 : if (d < 0) return -1;
205 :
206 427505 : while (d >= 0) {
207 193273 : x = x * 16 + d;
208 193273 : if (x > max_value) {
209 : ReportScannerError(Location(beg_pos, source_pos() + 1),
210 : MessageTemplate::kUndefinedUnicodeCodePoint);
211 : return -1;
212 : }
213 6889 : Advance<capture_raw>();
214 6889 : d = HexValue(c0_);
215 : }
216 :
217 : return x;
218 : }
219 :
220 403658414 : Token::Value Scanner::Next() {
221 : // Rotate through tokens.
222 403658414 : TokenDesc* previous = current_;
223 403658414 : current_ = next_;
224 : // Either we already have the next token lined up, in which case next_next_
225 : // simply becomes next_. In that case we use current_ as new next_next_ and
226 : // clear its token to indicate that it wasn't scanned yet. Otherwise we use
227 : // current_ as next_ and scan into it, leaving next_next_ uninitialized.
228 403658414 : if (V8_LIKELY(next_next().token == Token::UNINITIALIZED)) {
229 400928920 : next_ = previous;
230 : // User 'previous' instead of 'next_' because for some reason the compiler
231 : // thinks 'next_' could be modified before the entry into Scan.
232 400928920 : previous->after_line_terminator = false;
233 : Scan(previous);
234 : } else {
235 2729494 : next_ = next_next_;
236 2729494 : next_next_ = previous;
237 2729494 : previous->token = Token::UNINITIALIZED;
238 : DCHECK_NE(Token::UNINITIALIZED, current().token);
239 : }
240 403640430 : return current().token;
241 : }
242 :
243 2205419 : Token::Value Scanner::PeekAhead() {
244 : DCHECK(next().token != Token::DIV);
245 : DCHECK(next().token != Token::ASSIGN_DIV);
246 :
247 2205419 : if (next_next().token != Token::UNINITIALIZED) {
248 : return next_next().token;
249 : }
250 2033961 : TokenDesc* temp = next_;
251 2033961 : next_ = next_next_;
252 2033961 : next().after_line_terminator = false;
253 : Scan();
254 2033971 : next_next_ = next_;
255 2033971 : next_ = temp;
256 2033971 : return next_next().token;
257 : }
258 :
259 193 : Token::Value Scanner::SkipSingleHTMLComment() {
260 193 : if (is_module_) {
261 : ReportScannerError(source_pos(), MessageTemplate::kHtmlCommentInModule);
262 : return Token::ILLEGAL;
263 : }
264 188 : return SkipSingleLineComment();
265 : }
266 :
267 7264887 : Token::Value Scanner::SkipSingleLineComment() {
268 : // The line terminator at the end of the line is not considered
269 : // to be part of the single-line comment; it is recognized
270 : // separately by the lexical grammar and becomes part of the
271 : // stream of input elements for the syntactic grammar (see
272 : // ECMA-262, section 7.4).
273 372247773 : AdvanceUntil([](uc32 c0_) { return unibrow::IsLineTerminator(c0_); });
274 :
275 7265484 : return Token::WHITESPACE;
276 : }
277 :
278 4029 : Token::Value Scanner::SkipSourceURLComment() {
279 4029 : TryToParseSourceURLComment();
280 8058 : if (unibrow::IsLineTerminator(c0_) || c0_ == kEndOfInput) {
281 : return Token::WHITESPACE;
282 : }
283 78 : return SkipSingleLineComment();
284 : }
285 :
286 4029 : void Scanner::TryToParseSourceURLComment() {
287 : // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
288 : // function will just return if it cannot parse a magic comment.
289 : DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput));
290 8125 : if (!IsWhiteSpace(c0_)) return;
291 : Advance();
292 : LiteralBuffer name;
293 : name.Start();
294 :
295 123040 : while (c0_ != kEndOfInput && !IsWhiteSpaceOrLineTerminator(c0_) &&
296 41000 : c0_ != '=') {
297 37000 : name.AddChar(c0_);
298 : Advance();
299 : }
300 4020 : if (!name.is_one_byte()) return;
301 : Vector<const uint8_t> name_literal = name.one_byte_literal();
302 : LiteralBuffer* value;
303 4020 : if (name_literal == StaticCharVector("sourceURL")) {
304 3862 : value = &source_url_;
305 158 : } else if (name_literal == StaticCharVector("sourceMappingURL")) {
306 130 : value = &source_mapping_url_;
307 : } else {
308 : return;
309 : }
310 3992 : if (c0_ != '=')
311 : return;
312 : value->Start();
313 : Advance();
314 8004 : while (IsWhiteSpace(c0_)) {
315 : Advance();
316 : }
317 84997 : while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
318 : // Disallowed characters.
319 40413 : if (c0_ == '"' || c0_ == '\'') {
320 : value->Start();
321 : return;
322 : }
323 40393 : if (IsWhiteSpace(c0_)) {
324 : break;
325 : }
326 40353 : value->AddChar(c0_);
327 : Advance();
328 : }
329 : // Allow whitespace at the end.
330 4341 : while (c0_ != kEndOfInput && !unibrow::IsLineTerminator(c0_)) {
331 70 : if (!IsWhiteSpace(c0_)) {
332 : value->Start();
333 : break;
334 : }
335 : Advance();
336 : }
337 : }
338 :
339 58511 : Token::Value Scanner::SkipMultiLineComment() {
340 : DCHECK_EQ(c0_, '*');
341 :
342 : // Until we see the first newline, check for * and newline characters.
343 58511 : if (!next().after_line_terminator) {
344 : do {
345 : AdvanceUntil([](uc32 c0) {
346 66644 : if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
347 : return unibrow::IsLineTerminator(c0);
348 : }
349 66644 : uint8_t char_flags = character_scan_flags[c0];
350 : return MultilineCommentCharacterNeedsSlowPath(char_flags);
351 : });
352 :
353 8392 : while (c0_ == '*') {
354 : Advance();
355 7928 : if (c0_ == '/') {
356 : Advance();
357 7929 : return Token::WHITESPACE;
358 : }
359 : }
360 :
361 928 : if (unibrow::IsLineTerminator(c0_)) {
362 462 : next().after_line_terminator = true;
363 462 : break;
364 : }
365 2 : } while (c0_ != kEndOfInput);
366 : }
367 :
368 : // After we've seen newline, simply try to find '*/'.
369 184027 : while (c0_ != kEndOfInput) {
370 : AdvanceUntil([](uc32 c0) { return c0 == '*'; });
371 :
372 317509 : while (c0_ == '*') {
373 : Advance();
374 184069 : if (c0_ == '/') {
375 : Advance();
376 50601 : return Token::WHITESPACE;
377 : }
378 : }
379 : }
380 :
381 : return Token::ILLEGAL;
382 : }
383 :
384 2237369 : void Scanner::SkipHashBang() {
385 2242703 : if (c0_ == '#' && Peek() == '!' && source_pos() == 0) {
386 424 : SkipSingleLineComment();
387 : Scan();
388 : }
389 2237369 : }
390 :
391 73 : Token::Value Scanner::ScanHtmlComment() {
392 : // Check for <!-- comments.
393 : DCHECK_EQ(c0_, '!');
394 : Advance();
395 129 : if (c0_ != '-' || Peek() != '-') {
396 : PushBack('!'); // undo Advance()
397 34 : return Token::LT;
398 : }
399 : Advance();
400 :
401 39 : found_html_comment_ = true;
402 39 : return SkipSingleHTMLComment();
403 : }
404 :
#ifdef DEBUG
// Debug-only check that only TEMPLATE_* tokens carry an
// invalid_template_escape_message. ILLEGAL and UNINITIALIZED tokens are
// exempt because their fields (including literal_chars) may hold garbage.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  const auto kind = token.token;
  const bool exempt =
      kind == Token::UNINITIALIZED || kind == Token::ILLEGAL ||
      kind == Token::TEMPLATE_SPAN || kind == Token::TEMPLATE_TAIL;
  if (exempt) return;
  DCHECK_EQ(token.invalid_template_escape_message, MessageTemplate::kNone);
}
#endif  // DEBUG
423 :
424 60823 : void Scanner::SeekForward(int pos) {
425 : // After this call, we will have the token at the given position as
426 : // the "next" token. The "current" token will be invalid.
427 60823 : if (pos == next().location.beg_pos) return;
428 : int current_pos = source_pos();
429 : DCHECK_EQ(next().location.end_pos, current_pos);
430 : // Positions inside the lookahead token aren't supported.
431 : DCHECK(pos >= current_pos);
432 60818 : if (pos != current_pos) {
433 60813 : source_->Seek(pos);
434 : Advance();
435 : // This function is only called to seek to the location
436 : // of the end of a function (at the "}" token). It doesn't matter
437 : // whether there was a line terminator in the part we skip.
438 60813 : next().after_line_terminator = false;
439 : }
440 : Scan();
441 : }
442 :
443 : template <bool capture_raw>
444 25777777 : bool Scanner::ScanEscape() {
445 25777777 : uc32 c = c0_;
446 15006 : Advance<capture_raw>();
447 :
448 : // Skip escaped newlines.
449 : DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
450 51523776 : if (!capture_raw && unibrow::IsLineTerminator(c)) {
451 : // Allow escaped CR+LF newlines in multiline string literals.
452 11054 : if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
453 : return true;
454 : }
455 :
456 25765840 : switch (c) {
457 59 : case 'b' : c = '\b'; break;
458 107 : case 'f' : c = '\f'; break;
459 473477 : case 'n' : c = '\n'; break;
460 747 : case 'r' : c = '\r'; break;
461 400 : case 't' : c = '\t'; break;
462 : case 'u' : {
463 108138 : c = ScanUnicodeEscape<capture_raw>();
464 108140 : if (c < 0) return false;
465 : break;
466 : }
467 : case 'v':
468 : c = '\v';
469 47 : break;
470 : case 'x': {
471 25053595 : c = ScanHexNumber<capture_raw>(2);
472 25051837 : if (c < 0) return false;
473 : break;
474 : }
475 : case '0': // Fall through.
476 : case '1': // fall through
477 : case '2': // fall through
478 : case '3': // fall through
479 : case '4': // fall through
480 : case '5': // fall through
481 : case '6': // fall through
482 : case '7':
483 4258 : c = ScanOctalEscape<capture_raw>(c, 2);
484 4257 : break;
485 : }
486 :
487 : // Other escaped characters are interpreted as their non-escaped version.
488 : AddLiteralChar(c);
489 : return true;
490 : }
491 :
492 : template <bool capture_raw>
493 4258 : uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
494 4258 : uc32 x = c - '0';
495 : int i = 0;
496 9260 : for (; i < length; i++) {
497 6143 : int d = c0_ - '0';
498 6143 : if (d < 0 || d > 7) break;
499 2520 : int nx = x * 8 + d;
500 2520 : if (nx >= 256) break;
501 : x = nx;
502 880 : Advance<capture_raw>();
503 : }
504 : // Anything except '\0' is an octal escape sequence, illegal in strict mode.
505 : // Remember the position of octal escape sequences so that an error
506 : // can be reported later (in strict mode).
507 : // We don't report the error immediately, because the octal escape can
508 : // occur before the "use strict" directive.
509 5293 : if (c != '0' || i > 0 || IsNonOctalDecimalDigit(c0_)) {
510 3813 : octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
511 3813 : octal_message_ = capture_raw ? MessageTemplate::kTemplateOctalLiteral
512 : : MessageTemplate::kStrictOctalEscape;
513 : }
514 4257 : return x;
515 : }
516 :
517 10488684 : Token::Value Scanner::ScanString() {
518 10488684 : uc32 quote = c0_;
519 :
520 : next().literal_chars.Start();
521 : while (true) {
522 320839009 : AdvanceUntil([this](uc32 c0) {
523 166015993 : if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
524 8447157 : if (V8_UNLIKELY(unibrow::IsStringLiteralLineTerminator(c0))) {
525 : return true;
526 : }
527 : AddLiteralChar(c0);
528 : return false;
529 : }
530 157568836 : uint8_t char_flags = character_scan_flags[c0];
531 157568836 : if (MayTerminateString(char_flags)) return true;
532 : AddLiteralChar(c0);
533 : return false;
534 : });
535 :
536 36955486 : while (c0_ == '\\') {
537 : Advance();
538 : // TODO(verwaest): Check whether we can remove the additional check.
539 25759922 : if (V8_UNLIKELY(c0_ == kEndOfInput || !ScanEscape<false>())) {
540 : return Token::ILLEGAL;
541 : }
542 : }
543 :
544 11192886 : if (c0_ == quote) {
545 : Advance();
546 10487108 : return Token::STRING;
547 : }
548 :
549 1411377 : if (V8_UNLIKELY(c0_ == kEndOfInput ||
550 : unibrow::IsStringLiteralLineTerminator(c0_))) {
551 : return Token::ILLEGAL;
552 : }
553 :
554 : AddLiteralChar(c0_);
555 : }
556 : }
557 :
558 288776 : Token::Value Scanner::ScanPrivateName() {
559 288776 : if (!allow_harmony_private_fields()) {
560 : ReportScannerError(source_pos(),
561 : MessageTemplate::kInvalidOrUnexpectedToken);
562 : return Token::ILLEGAL;
563 : }
564 :
565 : next().literal_chars.Start();
566 : DCHECK_EQ(c0_, '#');
567 : DCHECK(!IsIdentifierStart(kEndOfInput));
568 84196 : if (!IsIdentifierStart(Peek())) {
569 : ReportScannerError(source_pos(),
570 : MessageTemplate::kInvalidOrUnexpectedToken);
571 : return Token::ILLEGAL;
572 : }
573 :
574 : AddLiteralCharAdvance();
575 : Token::Value token = ScanIdentifierOrKeywordInner();
576 61030 : return token == Token::ILLEGAL ? Token::ILLEGAL : Token::PRIVATE_NAME;
577 : }
578 :
// Scans one chunk of a template literal into the next() token, filling both
// the cooked (literal_chars) and raw (raw_literal_chars) buffers.
//
// Returns TEMPLATE_TAIL when the chunk ends at a closing backtick, and
// TEMPLATE_SPAN when it ends at "${" or when the input runs out before the
// literal is terminated. In every case next().token and
// next().location.end_pos are updated before returning.
579 157080 : Token::Value Scanner::ScanTemplateSpan() {
580 : // When scanning a TemplateSpan, we are looking for the following construct:
581 : // TEMPLATE_SPAN ::
582 : // ` LiteralChars* ${
583 : // | } LiteralChars* ${
584 : //
585 : // TEMPLATE_TAIL ::
586 : // ` LiteralChars* `
587 : // | } LiteralChar* `
588 : //
589 : // A TEMPLATE_SPAN should always be followed by an Expression, while a
590 : // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
591 : // followed by an Expression.
592 :
593 : // These scoped helpers save and restore the original error state, so that we
594 : // can specially treat invalid escape sequences in templates (which are
595 : // handled by the parser).
596 157080 : ErrorState scanner_error_state(&scanner_error_, &scanner_error_location_);
597 157080 : ErrorState octal_error_state(&octal_message_, &octal_pos_);
598 :
599 : Token::Value result = Token::TEMPLATE_SPAN;
600 : next().literal_chars.Start();
601 : next().raw_literal_chars.Start();
602 : const bool capture_raw = true;
603 : while (true) {
604 2111947 : uc32 c = c0_;
605 2111947 : if (c == '`') {
606 : Advance(); // Consume '`'
607 : result = Token::TEMPLATE_TAIL;
608 67987 : break;
609 2131365 : } else if (c == '$' && Peek() == '{') {
610 : Advance(); // Consume '$'
611 : Advance(); // Consume '{'
612 : break;
613 1956581 : } else if (c == '\\') {
614 : Advance(); // Consume '\\'
615 : DCHECK(!unibrow::IsLineTerminator(kEndOfInput));
616 : if (capture_raw) AddRawLiteralChar('\\');
617 30332 : if (unibrow::IsLineTerminator(c0_)) {
618 : // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
619 : // code unit sequence.
620 : uc32 lastChar = c0_;
621 : Advance();
622 160 : if (lastChar == '\r') {
623 : // Also skip \n.
624 92 : if (c0_ == '\n') Advance();
625 : lastChar = '\n';
626 : }
// Only the raw buffer records the (normalized) line terminator; nothing is
// added to the cooked literal for a line continuation.
627 : if (capture_raw) AddRawLiteralChar(lastChar);
628 : } else {
629 15006 : bool success = ScanEscape<capture_raw>();
630 : USE(success);
631 : DCHECK_EQ(!success, has_error());
632 : // For templates, invalid escape sequence checking is handled in the
633 : // parser.
// Any error recorded by ScanEscape is moved onto the token being built; the
// first such error wins and the scanner's own error slots are cleared again.
634 15006 : scanner_error_state.MoveErrorTo(next_);
635 15006 : octal_error_state.MoveErrorTo(next_);
636 : }
637 1941415 : } else if (c < 0) {
638 : // Unterminated template literal
639 : break;
640 : } else {
641 : Advance(); // Consume c.
642 : // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
643 : // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
644 : // consisting of the CV 0x000A.
645 1939700 : if (c == '\r') {
646 614 : if (c0_ == '\n') Advance(); // Consume '\n'
647 : c = '\n';
648 : }
649 : if (capture_raw) AddRawLiteralChar(c);
650 : AddLiteralChar(c);
651 : }
652 : }
653 157081 : next().location.end_pos = source_pos();
654 157081 : next().token = result;
655 :
656 157081 : return result;
657 : }
658 :
659 1732839 : Handle<String> Scanner::SourceUrl(Isolate* isolate) const {
660 : Handle<String> tmp;
661 1732839 : if (source_url_.length() > 0) {
662 3827 : tmp = source_url_.Internalize(isolate);
663 : }
664 1732839 : return tmp;
665 : }
666 :
667 1732843 : Handle<String> Scanner::SourceMappingUrl(Isolate* isolate) const {
668 : Handle<String> tmp;
669 1732843 : if (source_mapping_url_.length() > 0) {
670 100 : tmp = source_mapping_url_.Internalize(isolate);
671 : }
672 1732843 : return tmp;
673 : }
674 :
675 14704243 : bool Scanner::ScanDigitsWithNumericSeparators(bool (*predicate)(uc32 ch),
676 : bool is_check_first_digit) {
677 : // we must have at least one digit after 'x'/'b'/'o'
678 14704243 : if (is_check_first_digit && !predicate(c0_)) return false;
679 :
680 : bool separator_seen = false;
681 23564742 : while (predicate(c0_) || c0_ == '_') {
682 8861164 : if (c0_ == '_') {
683 : Advance();
684 2200 : if (c0_ == '_') {
685 : ReportScannerError(Location(source_pos(), source_pos() + 1),
686 : MessageTemplate::kContinuousNumericSeparator);
687 : return false;
688 : }
689 : separator_seen = true;
690 : continue;
691 : }
692 : separator_seen = false;
693 : AddLiteralCharAdvance();
694 : }
695 :
696 14703597 : if (separator_seen) {
697 : ReportScannerError(Location(source_pos(), source_pos() + 1),
698 : MessageTemplate::kTrailingNumericSeparator);
699 : return false;
700 : }
701 :
702 : return true;
703 : }
704 :
705 14199540 : bool Scanner::ScanDecimalDigits() {
706 14199540 : if (allow_harmony_numeric_separator()) {
707 14181814 : return ScanDigitsWithNumericSeparators(&IsDecimalDigit, false);
708 : }
709 52172 : while (IsDecimalDigit(c0_)) {
710 : AddLiteralCharAdvance();
711 : }
712 : return true;
713 : }
714 :
715 34684337 : bool Scanner::ScanDecimalAsSmiWithNumericSeparators(uint64_t* value) {
716 : bool separator_seen = false;
717 162846460 : while (IsDecimalDigit(c0_) || c0_ == '_') {
718 46739079 : if (c0_ == '_') {
719 : Advance();
720 1068 : if (c0_ == '_') {
721 : ReportScannerError(Location(source_pos(), source_pos() + 1),
722 : MessageTemplate::kContinuousNumericSeparator);
723 : return false;
724 : }
725 : separator_seen = true;
726 : continue;
727 : }
728 : separator_seen = false;
729 46738011 : *value = 10 * *value + (c0_ - '0');
730 46738011 : uc32 first_char = c0_;
731 : Advance();
732 : AddLiteralChar(first_char);
733 : }
734 :
735 34684151 : if (separator_seen) {
736 : ReportScannerError(Location(source_pos(), source_pos() + 1),
737 : MessageTemplate::kTrailingNumericSeparator);
738 : return false;
739 : }
740 :
741 : return true;
742 : }
743 :
744 37668898 : bool Scanner::ScanDecimalAsSmi(uint64_t* value) {
745 37668898 : if (allow_harmony_numeric_separator()) {
746 34684178 : return ScanDecimalAsSmiWithNumericSeparators(value);
747 : }
748 :
749 19393024 : while (IsDecimalDigit(c0_)) {
750 6711792 : *value = 10 * *value + (c0_ - '0');
751 6711792 : uc32 first_char = c0_;
752 : Advance();
753 : AddLiteralChar(first_char);
754 : }
755 : return true;
756 : }
757 :
758 757 : bool Scanner::ScanBinaryDigits() {
759 757 : if (allow_harmony_numeric_separator()) {
760 435 : return ScanDigitsWithNumericSeparators(&IsBinaryDigit, true);
761 : }
762 :
763 : // we must have at least one binary digit after 'b'/'B'
764 644 : if (!IsBinaryDigit(c0_)) {
765 : return false;
766 : }
767 :
768 1140 : while (IsBinaryDigit(c0_)) {
769 : AddLiteralCharAdvance();
770 : }
771 : return true;
772 : }
773 :
774 768 : bool Scanner::ScanOctalDigits() {
775 768 : if (allow_harmony_numeric_separator()) {
776 446 : return ScanDigitsWithNumericSeparators(&IsOctalDigit, true);
777 : }
778 :
779 : // we must have at least one octal digit after 'o'/'O'
780 644 : if (!IsOctalDigit(c0_)) {
781 : return false;
782 : }
783 :
784 1300 : while (IsOctalDigit(c0_)) {
785 : AddLiteralCharAdvance();
786 : }
787 : return true;
788 : }
789 :
790 199954 : bool Scanner::ScanImplicitOctalDigits(int start_pos,
791 : Scanner::NumberKind* kind) {
792 199954 : *kind = IMPLICIT_OCTAL;
793 :
794 : while (true) {
795 : // (possible) octal number
796 808060 : if (IsNonOctalDecimalDigit(c0_)) {
797 6 : *kind = DECIMAL_WITH_LEADING_ZERO;
798 6 : return true;
799 : }
800 404024 : if (!IsOctalDigit(c0_)) {
801 : // Octal literal finished.
802 199948 : octal_pos_ = Location(start_pos, source_pos());
803 199948 : octal_message_ = MessageTemplate::kStrictOctalLiteral;
804 199948 : return true;
805 : }
806 : AddLiteralCharAdvance();
807 : }
808 : }
809 :
810 521948 : bool Scanner::ScanHexDigits() {
811 521948 : if (allow_harmony_numeric_separator()) {
812 521548 : return ScanDigitsWithNumericSeparators(&IsHexDigit, true);
813 : }
814 :
815 : // we must have at least one hex digit after 'x'/'X'
816 800 : if (!IsHexDigit(c0_)) {
817 : return false;
818 : }
819 :
820 1280 : while (IsHexDigit(c0_)) {
821 : AddLiteralCharAdvance();
822 : }
823 : return true;
824 : }
825 :
826 15665 : bool Scanner::ScanSignedInteger() {
827 15665 : if (c0_ == '+' || c0_ == '-') AddLiteralCharAdvance();
828 : // we must have at least one decimal digit after 'e'/'E'
829 31330 : if (!IsDecimalDigit(c0_)) return false;
830 15291 : return ScanDecimalDigits();
831 : }
832 :
// Scans a numeric literal into next().literal_chars.
//
// |seen_period| is true when the caller has already consumed a leading '.',
// in which case c0_ is the first digit of the fraction; otherwise c0_ is the
// first digit of the number.
//
// Returns:
//   Token::SMI     for a decimal integer whose literal is at most 10
//                  characters, whose value is <= Smi::kMaxValue and which is
//                  not followed by '.' or an identifier start; the value is
//                  stored in next().smi_value_.
//   Token::BIGINT  when the literal is terminated by 'n' (no period seen and
//                  a kind accepted by IsValidBigIntKind).
//   Token::NUMBER  for every other well-formed literal.
//   Token::ILLEGAL for malformed input (missing digits, misplaced numeric
//                  separators, over-long BigInt, or a digit/identifier start
//                  directly after the literal).
//
// Legacy octal literals and decimals with a leading zero are accepted here;
// their location is recorded in octal_pos_/octal_message_ so an error can be
// raised later.
833 38396062 : Token::Value Scanner::ScanNumber(bool seen_period) {
834 : DCHECK(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
835 :
836 38396062 : NumberKind kind = DECIMAL;
837 :
838 : next().literal_chars.Start();
// at_start gates the Smi fast path below: it is only taken when no period
// has been seen yet.
839 38396062 : bool at_start = !seen_period;
840 : int start_pos = source_pos(); // For reporting octal positions.
841 38396062 : if (seen_period) {
842 : // we have already seen a decimal point of the float
843 : AddLiteralChar('.');
844 3066 : if (allow_harmony_numeric_separator() && c0_ == '_') {
845 : return Token::ILLEGAL;
846 : }
847 : // we know we have at least one digit
848 3066 : if (!ScanDecimalDigits()) return Token::ILLEGAL;
849 : } else {
850 : // if the first character is '0' we must check for octals and hex
851 38392996 : if (c0_ == '0') {
852 : AddLiteralCharAdvance();
853 :
854 : // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
855 : // an octal number.
856 16806618 : if (AsciiAlphaToLower(c0_) == 'x') {
857 : AddLiteralCharAdvance();
858 521948 : kind = HEX;
859 521948 : if (!ScanHexDigits()) return Token::ILLEGAL;
860 16284655 : } else if (AsciiAlphaToLower(c0_) == 'o') {
861 : AddLiteralCharAdvance();
862 768 : kind = OCTAL;
863 768 : if (!ScanOctalDigits()) return Token::ILLEGAL;
864 16283887 : } else if (AsciiAlphaToLower(c0_) == 'b') {
865 : AddLiteralCharAdvance();
866 757 : kind = BINARY;
867 757 : if (!ScanBinaryDigits()) return Token::ILLEGAL;
868 16283130 : } else if (IsOctalDigit(c0_)) {
869 199954 : kind = IMPLICIT_OCTAL;
870 199954 : if (!ScanImplicitOctalDigits(start_pos, &kind)) {
871 : return Token::ILLEGAL;
872 : }
// The octal scan hit an 8 or 9, so the literal is really a decimal with a
// leading zero; digits were already consumed, so skip the Smi fast path.
873 199954 : if (kind == DECIMAL_WITH_LEADING_ZERO) {
874 : at_start = false;
875 : }
876 16083176 : } else if (IsNonOctalDecimalDigit(c0_)) {
877 48818 : kind = DECIMAL_WITH_LEADING_ZERO;
878 16034358 : } else if (allow_harmony_numeric_separator() && c0_ == '_') {
879 : ReportScannerError(Location(source_pos(), source_pos() + 1),
880 : MessageTemplate::kZeroDigitNumericSeparator);
881 : return Token::ILLEGAL;
882 : }
883 : }
884 :
885 : // Parse decimal digits and allow trailing fractional part.
886 76782330 : if (IsDecimalNumberKind(kind)) {
887 : // This is an optimization for parsing Decimal numbers as Smi's.
888 37668822 : if (at_start) {
889 37668833 : uint64_t value = 0;
890 : // scan subsequent decimal digits
891 68183458 : if (!ScanDecimalAsSmi(&value)) return Token::ILLEGAL;
892 :
893 75307938 : if (next().literal_chars.one_byte_literal().length() <= 10 &&
894 105745492 : value <= Smi::kMaxValue && c0_ != '.' && !IsIdentifierStart(c0_)) {
895 30514400 : next().smi_value_ = static_cast<uint32_t>(value);
896 :
897 30514400 : if (kind == DECIMAL_WITH_LEADING_ZERO) {
898 48818 : octal_pos_ = Location(start_pos, source_pos());
899 48818 : octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
900 : }
901 : return Token::SMI;
902 : }
903 : }
904 :
// Slow path: continue with any remaining integer digits and an optional
// fractional part.
905 7154377 : if (!ScanDecimalDigits()) return Token::ILLEGAL;
906 7154400 : if (c0_ == '.') {
907 : seen_period = true;
908 : AddLiteralCharAdvance();
909 7026903 : if (allow_harmony_numeric_separator() && c0_ == '_') {
910 : return Token::ILLEGAL;
911 : }
912 7026797 : if (!ScanDecimalDigits()) return Token::ILLEGAL;
913 : }
914 : }
915 : }
916 :
917 : bool is_bigint = false;
918 7893296 : if (c0_ == 'n' && !seen_period && IsValidBigIntKind(kind)) {
919 : // Check that the literal is within our limits for BigInt length.
920 : // For simplicity, use 4 bits per character to calculate the maximum
921 : // allowed literal length.
922 : static const int kMaxBigIntCharacters = BigInt::kMaxLengthBits / 4;
// For non-decimal kinds, two characters are subtracted from the length
// (presumably the "0x"/"0o"/"0b" prefix).
923 13743 : int length = source_pos() - start_pos - (kind != DECIMAL ? 2 : 0);
924 13743 : if (length > kMaxBigIntCharacters) {
925 : ReportScannerError(Location(start_pos, source_pos()),
926 : MessageTemplate::kBigIntTooBig);
927 : return Token::ILLEGAL;
928 : }
929 :
930 : is_bigint = true;
931 : Advance();
932 7865792 : } else if (AsciiAlphaToLower(c0_) == 'e') {
933 : // scan exponent, if any
934 : DCHECK(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
935 :
936 31330 : if (!IsDecimalNumberKind(kind)) return Token::ILLEGAL;
937 :
938 : // scan exponent
939 : AddLiteralCharAdvance();
940 :
941 15665 : if (!ScanSignedInteger()) return Token::ILLEGAL;
942 : }
943 :
944 : // The source character immediately following a numeric literal must
945 : // not be an identifier start or a decimal digit; see ECMA-262
946 : // section 7.8.3, page 17 (note that we read only one decimal digit
947 : // if the value is 0).
948 23637490 : if (IsDecimalDigit(c0_) || IsIdentifierStart(c0_)) {
949 : return Token::ILLEGAL;
950 : }
951 :
952 7875512 : if (kind == DECIMAL_WITH_LEADING_ZERO) {
953 6 : octal_pos_ = Location(start_pos, source_pos());
954 6 : octal_message_ = MessageTemplate::kStrictDecimalWithLeadingZero;
955 : }
956 :
957 7875512 : return is_bigint ? Token::BIGINT : Token::NUMBER;
958 : }
959 :
// Scans a unicode escape that occurs inside an identifier.
// The first Advance() steps over the character currently held in c0_ (the
// caller visible in this file invokes this function when c0_ is a backslash).
// Returns -1 if the following character is not 'u'; otherwise skips the 'u'
// and returns the result of ScanUnicodeEscape<false>().
960 37812 : uc32 Scanner::ScanIdentifierUnicodeEscape() {
961 : Advance();
962 37812 : if (c0_ != 'u') return -1;
963 : Advance();
964 30403 : return ScanUnicodeEscape<false>();
965 : }
966 :
// Scans the part of a unicode escape that follows the already-consumed
// backslash and 'u', and returns the scanned value.
// In the braced form, -1 is returned (after reporting
// kInvalidUnicodeEscapeSequence) when the hex number is negative or the
// closing '}' is missing. In the unbraced form the result of
// ScanHexNumber<capture_raw, true>(4) is returned unchanged.
// capture_raw is forwarded as-is to Advance and the hex-number helpers;
// presumably it selects whether raw source characters are recorded as well --
// confirm against those helpers.
967 : template <bool capture_raw>
968 138542 : uc32 Scanner::ScanUnicodeEscape() {
969 : // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
970 : // hex digits between { } is arbitrary. \ and u have already been read.
971 138542 : if (c0_ == '{') {
// Two positions back from the current one; presumably the position of the
// backslash that opened the escape (TODO confirm source_pos() semantics).
972 47283 : int begin = source_pos() - 2;
973 5629 : Advance<capture_raw>();
// 0x10FFFF is the largest Unicode code point; presumably the helper uses
// it as the maximum accepted value.
974 47283 : uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10FFFF, begin);
975 47283 : if (cp < 0 || c0_ != '}') {
976 : ReportScannerError(source_pos(),
977 : MessageTemplate::kInvalidUnicodeEscapeSequence);
978 : return -1;
979 : }
// Consume the closing '}'.
980 757 : Advance<capture_raw>();
981 40879 : return cp;
982 : }
983 : const bool unicode = true;
984 91259 : return ScanHexNumber<capture_raw, unicode>(4);
985 : }
986 :
// Slow-path scan of the rest of an identifier or keyword, handling unicode
// escapes and surrogate pairs.
//   escaped        - in: whether an escape was already seen; set to true here
//                    as soon as a backslash is encountered.
//   can_be_keyword - in: whether the characters so far could still spell a
//                    keyword; cleared once a character fails CharCanBeKeyword.
// Returns Token::ILLEGAL when an escape does not decode to a legal identifier
// part, a keyword-related token when the one-byte literal matches one (see the
// classification below), and Token::IDENTIFIER otherwise.
987 16435 : Token::Value Scanner::ScanIdentifierOrKeywordInnerSlow(bool escaped,
988 : bool can_be_keyword) {
989 : while (true) {
990 71410 : if (c0_ == '\\') {
991 : escaped = true;
992 13201 : uc32 c = ScanIdentifierUnicodeEscape();
993 : // Only allow legal identifier part characters.
994 : // TODO(verwaest): Make this true.
995 : // DCHECK(!IsIdentifierPart('\'));
// The failure value -1 must not count as an identifier part, so a failed
// escape falls into the ILLEGAL branch below. The backslash is tested
// explicitly because of the TODO above.
996 : DCHECK(!IsIdentifierPart(-1));
997 26402 : if (c == '\\' || !IsIdentifierPart(c)) {
998 : return Token::ILLEGAL;
999 : }
1000 25164 : can_be_keyword = can_be_keyword && CharCanBeKeyword(c);
1001 : AddLiteralChar(c);
// CombineSurrogatePair() is only tried when c0_ on its own is not an
// identifier part; c0_ is then tested again.
1002 132329 : } else if (IsIdentifierPart(c0_) ||
1003 16055 : (CombineSurrogatePair() && IsIdentifierPart(c0_))) {
1004 83484 : can_be_keyword = can_be_keyword && CharCanBeKeyword(c0_);
1005 : AddLiteralCharAdvance();
1006 : } else {
1007 : break;
1008 : }
1009 : }
1010 :
// Keyword lookup is only attempted for one-byte literals whose characters
// all passed CharCanBeKeyword.
1011 15857 : if (can_be_keyword && next().literal_chars.is_one_byte()) {
1012 : Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
1013 : Token::Value token =
1014 : KeywordOrIdentifierToken(chars.start(), chars.length());
// Tokens in [IDENTIFIER, YIELD] are returned as-is, escaped or not.
1015 14695 : if (IsInRange(token, Token::IDENTIFIER, Token::YIELD)) return token;
1016 :
1017 12792 : if (token == Token::FUTURE_STRICT_RESERVED_WORD) {
1018 2280 : if (escaped) return Token::ESCAPED_STRICT_RESERVED_WORD;
1019 0 : return token;
1020 : }
1021 :
1022 10512 : if (!escaped) return token;
1023 :
// From here on the keyword was written with an escape. LET and STATIC are
// adjacent (asserted below), so the range check matches exactly those two.
1024 : STATIC_ASSERT(Token::LET + 1 == Token::STATIC);
1025 10512 : if (IsInRange(token, Token::LET, Token::STATIC)) {
1026 : return Token::ESCAPED_STRICT_RESERVED_WORD;
1027 : }
1028 7260 : return Token::ESCAPED_KEYWORD;
1029 : }
1030 :
1031 : return Token::IDENTIFIER;
1032 : }
1033 :
// Scans the body of a regular expression literal up to and including the
// closing '/', collecting the body characters in next().literal_chars.
// Expects next().token to be DIV or ASSIGN_DIV and next_next() to be
// uninitialized (both DCHECKed).
// Returns false if end of input or a line terminator is reached before the
// closing '/'; otherwise sets next().token to REGEXP_LITERAL and returns true.
1034 79904 : bool Scanner::ScanRegExpPattern() {
1035 : DCHECK_EQ(Token::UNINITIALIZED, next_next().token);
1036 : DCHECK(next().token == Token::DIV || next().token == Token::ASSIGN_DIV);
1037 :
1038 : // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1039 : bool in_character_class = false;
1040 :
1041 : // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1042 : // the scanner should pass uninterpreted bodies to the RegExp
1043 : // constructor.
1044 : next().literal_chars.Start();
// For a '/=' token the '=' is added to the literal first; presumably it was
// already consumed as part of the token and belongs to the body.
1045 79904 : if (next().token == Token::ASSIGN_DIV) {
1046 : AddLiteralChar('=');
1047 : }
1048 :
// A '/' only ends the body when it is outside a character class ([...]).
1049 985616 : while (c0_ != '/' || in_character_class) {
1050 1811392 : if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
1051 : return false;
1052 : }
1053 905681 : if (c0_ == '\\') { // Escape sequence.
1054 : AddLiteralCharAdvance();
1055 184437 : if (c0_ == kEndOfInput || unibrow::IsLineTerminator(c0_)) {
1056 : return false;
1057 : }
1058 : AddLiteralCharAdvance();
1059 : // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1060 : // only "safe" characters are allowed (letters, digits, underscore),
1061 : // otherwise the escape isn't valid and the invalid character has
1062 : // its normal meaning. I.e., we can just continue scanning without
1063 : // worrying whether the following characters are part of the escape
1064 : // or not, since any '/', '\\' or '[' is guaranteed to not be part
1065 : // of the escape sequence.
1066 :
1067 : // TODO(896): At some point, parse RegExps more thoroughly to capture
1068 : // octal escapes in strict mode.
1069 : } else { // Unescaped character.
1070 813443 : if (c0_ == '[') in_character_class = true;
1071 813443 : if (c0_ == ']') in_character_class = false;
1072 : AddLiteralCharAdvance();
1073 : }
1074 : }
1075 : Advance(); // consume '/'
1076 :
1077 79634 : next().token = Token::REGEXP_LITERAL;
1078 79634 : return true;
1079 : }
1080 :
1081 :
// Scans the flags that follow a regular expression literal; next().token must
// be REGEXP_LITERAL (DCHECKed).
// Consumes characters while IsIdentifierPart(c0_) holds and maps g, i, m, s,
// u and y to the corresponding RegExp flag bits.
// Returns Nothing if a character is not one of these flags or if a flag is
// repeated; the offending character is not consumed in that case. On success,
// updates next().location.end_pos and returns the accumulated flags.
1082 79521 : Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
1083 : DCHECK_EQ(Token::REGEXP_LITERAL, next().token);
1084 :
1085 : // Scan regular expression flags.
1086 : int flags = 0;
1087 263460 : while (IsIdentifierPart(c0_)) {
1088 : RegExp::Flags flag = RegExp::kNone;
1089 35415 : switch (c0_) {
1090 : case 'g':
1091 : flag = RegExp::kGlobal;
1092 : break;
1093 : case 'i':
1094 : flag = RegExp::kIgnoreCase;
1095 4524 : break;
1096 : case 'm':
1097 : flag = RegExp::kMultiline;
1098 596 : break;
1099 : case 's':
1100 : flag = RegExp::kDotAll;
1101 156 : break;
1102 : case 'u':
1103 : flag = RegExp::kUnicode;
1104 7298 : break;
1105 : case 'y':
1106 : flag = RegExp::kSticky;
1107 141 : break;
1108 : default:
1109 : return Nothing<RegExp::Flags>();
1110 : }
// A flag bit that is already set means the same flag appeared twice.
1111 34931 : if (flags & flag) {
1112 : return Nothing<RegExp::Flags>();
1113 : }
1114 : Advance();
1115 34805 : flags |= flag;
1116 : }
1117 :
1118 78914 : next().location.end_pos = source_pos();
1119 78914 : return Just(RegExp::Flags(flags));
1120 : }
1121 :
// Returns the AstRawString that ast_value_factory provides for the current
// literal, using GetOneByteString when the literal is one-byte and
// GetTwoByteString otherwise.
1122 90707997 : const AstRawString* Scanner::CurrentSymbol(
1123 : AstValueFactory* ast_value_factory) const {
1124 90707997 : if (is_literal_one_byte()) {
1125 90637581 : return ast_value_factory->GetOneByteString(literal_one_byte_string());
1126 : }
1127 75238 : return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1128 : }
1129 :
// Returns the AstRawString that ast_value_factory provides for the literal of
// the next token, using GetOneByteString when that literal is one-byte and
// GetTwoByteString otherwise.
1130 568885 : const AstRawString* Scanner::NextSymbol(
1131 : AstValueFactory* ast_value_factory) const {
1132 568885 : if (is_next_literal_one_byte()) {
1133 566748 : return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1134 : }
1135 2142 : return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1136 : }
1137 :
// Returns the AstRawString that ast_value_factory provides for the current
// raw literal, using GetOneByteString when the raw literal is one-byte and
// GetTwoByteString otherwise.
1138 84013 : const AstRawString* Scanner::CurrentRawSymbol(
1139 : AstValueFactory* ast_value_factory) const {
1140 84013 : if (is_raw_literal_one_byte()) {
1141 83980 : return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1142 : }
1143 33 : return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1144 : }
1145 :
1146 :
// Converts the current literal to a double via StringToDouble, with hex,
// octal, implicit-octal and binary notations allowed. The literal must be
// one-byte (DCHECKed).
1147 1309231 : double Scanner::DoubleValue() {
1148 : DCHECK(is_literal_one_byte());
1149 : return StringToDouble(
1150 : literal_one_byte_string(),
1151 1309231 : ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1152 : }
1153 :
// Copies the current one-byte literal into a char array obtained from
// zone->NewArray and returns it as a NUL-terminated C string. The literal
// must be one-byte (DCHECKed).
1154 12294 : const char* Scanner::CurrentLiteralAsCString(Zone* zone) const {
1155 : DCHECK(is_literal_one_byte());
1156 : Vector<const uint8_t> vector = literal_one_byte_string();
1157 : int length = vector.length();
// One extra element for the terminating '\0'.
1158 12294 : char* buffer = zone->NewArray<char>(length + 1);
1159 : memcpy(buffer, vector.start(), length);
1160 12293 : buffer[length] = '\0';
1161 12293 : return buffer;
1162 : }
1163 :
// Repositions the scanner so that the next token is re-scanned starting at
// character offset `position` of the source.
// Every TokenDesc in token_storage_ has its token set to UNINITIALIZED and its
// invalid_template_escape_message cleared; then the source is seeked, the
// look-ahead character c0_ is reloaded and one token is scanned.
1164 49124 : void Scanner::SeekNext(size_t position) {
1165 : // Use with care: This cleanly resets most, but not all scanner state.
1166 : // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.
1167 :
1168 : // To re-scan from a given character position, we need to:
1169 : // 1, Reset the current_, next_ and next_next_ tokens
1170 : // (next_ + next_next_ will be overwritten by Next(),
1171 : // current_ will remain unchanged, so overwrite it fully.)
1172 196496 : for (TokenDesc& token : token_storage_) {
1173 147372 : token.token = Token::UNINITIALIZED;
1174 147372 : token.invalid_template_escape_message = MessageTemplate::kNone;
1175 : }
1176 : // 2, reset the source to the desired position,
1177 49124 : source_->Seek(position);
1178 : // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
1179 98248 : c0_ = source_->Advance();
1180 49124 : next().after_line_terminator = false;
1181 : Scan();
// The freshly scanned token is expected to begin exactly at `position`.
1182 : DCHECK_EQ(next().location.beg_pos, static_cast<int>(position));
1183 49124 : }
1184 :
1185 : } // namespace internal
1186 121996 : } // namespace v8
|