Coverage Report

Created: 2023-11-19 07:23

/src/hermes/lib/Parser/JSLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) Meta Platforms, Inc. and affiliates.
3
 *
4
 * This source code is licensed under the MIT license found in the
5
 * LICENSE file in the root directory of this source tree.
6
 */
7
8
#include "hermes/Parser/JSLexer.h"
9
#include "hermes/Platform/Unicode/CharacterProperties.h"
10
11
#include "dtoa/dtoa.h"
12
#include "hermes/Support/Conversions.h"
13
14
#include "llvh/ADT/ScopeExit.h"
15
#include "llvh/ADT/StringSwitch.h"
16
17
namespace hermes {
18
namespace parser {
19
20
namespace {
21
22
const char *g_tokenStr[] = {
23
#define TOK(name, str) str,
24
#include "hermes/Parser/TokenKinds.def"
25
};
26
27
const int UTF8_LINE_TERMINATOR_CHAR0 = 0xe2;
28
29
405
inline bool matchUnicodeLineTerminatorOffset1(const char *curCharPtr_) {
30
  // Line separator \u2028 UTF8 encoded is      : e2 80 a8
31
  // Paragraph separator \u2029 UTF8 encoded is: e2 80 a9
32
405
  return (unsigned char)curCharPtr_[1] == 0x80 &&
33
405
      ((unsigned char)curCharPtr_[2] == 0xa8 ||
34
55
       (unsigned char)curCharPtr_[2] == 0xa9);
35
405
}
36
} // namespace
37
38
35.5k
const char *tokenKindStr(TokenKind kind) {
39
35.5k
  assert(kind <= TokenKind::_last_token);
40
0
  return g_tokenStr[static_cast<unsigned>(kind)];
41
35.5k
}
42
43
#if HERMES_PARSE_JSX
44
1
static llvh::DenseMap<llvh::StringRef, uint32_t> initializeHTMLEntities() {
45
1
  llvh::DenseMap<llvh::StringRef, uint32_t> entities{};
46
47
1
#define HTML_ENTITY(NAME, VALUE) \
48
253
  entities.insert({llvh::StringLiteral(#NAME), VALUE});
49
1
#include "hermes/Parser/HTMLEntities.def"
50
51
1
  return entities;
52
1
}
53
54
291
static const llvh::DenseMap<llvh::StringRef, uint32_t> &getHTMLEntities() {
55
291
  static const auto entities = initializeHTMLEntities();
56
291
  return entities;
57
291
}
58
#endif
59
60
JSLexer::JSLexer(
61
    uint32_t bufId,
62
    SourceErrorManager &sm,
63
    Allocator &allocator,
64
    StringTable *strTab,
65
    bool strictMode,
66
    bool convertSurrogates)
67
    : sm_(sm),
68
      allocator_(allocator),
69
      ownStrTab_(strTab ? nullptr : new StringTable(allocator_)),
70
      strTab_(strTab ? *strTab : *ownStrTab_),
71
#if HERMES_PARSE_JSX
72
      htmlEntities_(getHTMLEntities()),
73
#endif
74
      strictMode_(strictMode),
75
186
      convertSurrogates_(convertSurrogates) {
76
186
  initializeWithBufferId(bufId);
77
186
  initializeReservedIdentifiers();
78
186
}
79
80
JSLexer::JSLexer(
81
    std::unique_ptr<llvh::MemoryBuffer> input,
82
    SourceErrorManager &sm,
83
    Allocator &allocator,
84
    StringTable *strTab,
85
    bool strictMode,
86
    bool convertSurrogates)
87
    : sm_(sm),
88
      allocator_(allocator),
89
      ownStrTab_(strTab ? nullptr : new StringTable(allocator_)),
90
      strTab_(strTab ? *strTab : *ownStrTab_),
91
#if HERMES_PARSE_JSX
92
      htmlEntities_(getHTMLEntities()),
93
#endif
94
      strictMode_(strictMode),
95
105
      convertSurrogates_(convertSurrogates) {
96
105
  auto bufId = sm_.addNewSourceBuffer(std::move(input));
97
105
  initializeWithBufferId(bufId);
98
105
  initializeReservedIdentifiers();
99
105
}
100
101
291
void JSLexer::initializeWithBufferId(uint32_t bufId) {
102
291
  auto *buffer = sm_.getSourceBuffer(bufId);
103
291
  bufId_ = bufId;
104
291
  bufferStart_ = buffer->getBufferStart();
105
291
  bufferEnd_ = buffer->getBufferEnd();
106
291
  curCharPtr_ = bufferStart_;
107
291
  assert(*bufferEnd_ == 0 && "buffer must be zero terminated");
108
291
}
109
110
291
void JSLexer::initializeReservedIdentifiers() {
111
  // Add all reserved words to the identifier table
112
12.8k
#define RESWORD(name) resWordIdent(TokenKind::rw_##name) = getIdentifier(#name);
113
291
#include "hermes/Parser/TokenKinds.def"
114
291
}
115
116
8.69M
const Token *JSLexer::advance(GrammarContext grammarContext) {
117
8.69M
  newLineBeforeCurrentToken_ = false;
118
119
12.1M
  for (;;) {
120
12.1M
    assert(curCharPtr_ <= bufferEnd_ && "lexing past end of input");
121
0
#define PUNC_L1_1(ch, tok)        \
122
3.01M
  case ch:                        \
123
3.01M
    token_.setStart(curCharPtr_); \
124
3.01M
    token_.setPunctuator(tok);    \
125
3.01M
    ++curCharPtr_;                \
126
3.01M
    break
127
128
0
#define PUNC_L2_3(ch1, tok1, ch2a, tok2a, ch2b, tok2b) \
129
532k
  case ch1:                                            \
130
532k
    token_.setStart(curCharPtr_);                      \
131
532k
    if (curCharPtr_[1] == ch2a) {                      \
132
0
      token_.setPunctuator(tok2a);                     \
133
0
      curCharPtr_ += 2;                                \
134
532k
    } else if (curCharPtr_[1] == ch2b) {               \
135
0
      token_.setPunctuator(tok2b);                     \
136
0
      curCharPtr_ += 2;                                \
137
532k
    } else {                                           \
138
532k
      token_.setPunctuator(tok1);                      \
139
532k
      curCharPtr_ += 1;                                \
140
532k
    }                                                  \
141
532k
    break
142
143
0
#define PUNC_L2_2(ch1, tok1, ch2, tok2) \
144
647
  case ch1:                             \
145
647
    token_.setStart(curCharPtr_);       \
146
647
    if (curCharPtr_[1] == (ch2)) {      \
147
0
      token_.setPunctuator(tok2);       \
148
0
      curCharPtr_ += 2;                 \
149
647
    } else {                            \
150
647
      token_.setPunctuator(tok1);       \
151
647
      curCharPtr_ += 1;                 \
152
647
    }                                   \
153
647
    break
154
155
0
#define PUNC_L3_3(ch1, tok1, ch2, tok2, ch3, tok3) \
156
20
  case ch1:                                        \
157
20
    token_.setStart(curCharPtr_);                  \
158
20
    if (curCharPtr_[1] != (ch2)) {                 \
159
20
      token_.setPunctuator(tok1);                  \
160
20
      curCharPtr_ += 1;                            \
161
20
    } else if (curCharPtr_[2] == (ch3)) {          \
162
0
      token_.setPunctuator(tok3);                  \
163
0
      curCharPtr_ += 3;                            \
164
0
    } else {                                       \
165
0
      token_.setPunctuator(tok2);                  \
166
0
      curCharPtr_ += 2;                            \
167
0
    }                                              \
168
20
    break
169
170
0
    switch ((unsigned char)*curCharPtr_) {
171
293
      case 0:
172
293
        token_.setStart(curCharPtr_);
173
293
        if (curCharPtr_ == bufferEnd_) {
174
291
          token_.setEof();
175
291
        } else {
176
2
          if (!error(
177
2
                  token_.getStartLoc(),
178
2
                  "unrecognized Unicode character \\u0000")) {
179
2
            token_.setEof();
180
2
          } else {
181
0
            ++curCharPtr_;
182
0
            continue;
183
0
          }
184
2
        }
185
293
        break;
186
187
        // clang-format off
188
177k
      PUNC_L1_1('}', TokenKind::r_brace);
189
6.52k
      PUNC_L1_1('(', TokenKind::l_paren);
190
5.49k
      PUNC_L1_1(')', TokenKind::r_paren);
191
35.0k
      PUNC_L1_1('[', TokenKind::l_square);
192
33.9k
      PUNC_L1_1(']', TokenKind::r_square);
193
258k
      PUNC_L1_1(';', TokenKind::semi);
194
2.39M
      PUNC_L1_1(',', TokenKind::comma);
195
1.78k
      PUNC_L1_1('~', TokenKind::tilde);
196
95.6k
      PUNC_L1_1(':', TokenKind::colon);
197
198
      // { {|
199
3.60k
      case '{':
200
3.60k
        token_.setStart(curCharPtr_);
201
3.60k
        if (HERMES_PARSE_FLOW &&
202
3.60k
            LLVM_UNLIKELY(grammarContext == GrammarContext::Type) &&
203
3.60k
            curCharPtr_[1] == '|') {
204
0
          token_.setPunctuator(TokenKind::l_bracepipe);
205
0
          curCharPtr_ += 2;
206
3.60k
        } else {
207
3.60k
          token_.setPunctuator(TokenKind::l_brace);
208
3.60k
          curCharPtr_ += 1;
209
3.60k
        }
210
3.60k
        break;
211
212
      // = => == ===
213
341k
      case '=':
214
341k
        token_.setStart(curCharPtr_);
215
341k
        if (curCharPtr_[1] == '>') {
216
327k
          token_.setPunctuator(TokenKind::equalgreater);
217
327k
          curCharPtr_ += 2;
218
327k
        } else if (curCharPtr_[1] != '=') {
219
13.7k
          token_.setPunctuator(TokenKind::equal);
220
13.7k
          curCharPtr_ += 1;
221
13.7k
        } else if (curCharPtr_[2] == '=') {
222
8
          token_.setPunctuator(TokenKind::equalequalequal);
223
8
          curCharPtr_ += 3;
224
23
        } else {
225
23
          token_.setPunctuator(TokenKind::equalequal);
226
23
          curCharPtr_ += 2;
227
23
        }
228
341k
        break;
229
230
      // ! != !==
231
293
      PUNC_L3_3('!', TokenKind::exclaim, '=', TokenKind::exclaimequal, '=', TokenKind::exclaimequalequal);
232
233
      // + ++ +=
234
      // - -- -=
235
      // & && &=
236
      // | || |=
237
525k
      PUNC_L2_3('+', TokenKind::plus,  '+', TokenKind::plusplus,   '=', TokenKind::plusequal);
238
6.37k
      PUNC_L2_3('-', TokenKind::minus, '-', TokenKind::minusminus, '=', TokenKind::minusequal);
239
240
172
      case '&':
241
172
        token_.setStart(curCharPtr_);
242
172
        if (curCharPtr_[1] == '&') {
243
24
          if (curCharPtr_[2] == '=') {
244
2
            token_.setPunctuator(TokenKind::ampampequal);
245
2
            curCharPtr_ += 3;
246
22
          } else {
247
22
            token_.setPunctuator(TokenKind::ampamp);
248
22
            curCharPtr_ += 2;
249
22
          }
250
148
        } else if (curCharPtr_[1] == '=') {
251
2
          token_.setPunctuator(TokenKind::ampequal);
252
2
          curCharPtr_ += 2;
253
146
        } else {
254
146
          token_.setPunctuator(TokenKind::amp);
255
146
          curCharPtr_ += 1;
256
146
        }
257
172
        break;
258
259
379k
      case '|':
260
379k
        token_.setStart(curCharPtr_);
261
379k
        if (HERMES_PARSE_FLOW &&
262
379k
            LLVM_UNLIKELY(grammarContext == GrammarContext::Type) &&
263
379k
            curCharPtr_[1] == '}') {
264
0
          token_.setPunctuator(TokenKind::piper_brace);
265
0
          curCharPtr_ += 2;
266
379k
        } else {
267
379k
          if (curCharPtr_[1] == '|') {
268
58.3k
            if (curCharPtr_[2] == '=') {
269
16
              token_.setPunctuator(TokenKind::pipepipeequal);
270
16
              curCharPtr_ += 3;
271
58.2k
            } else {
272
58.2k
              token_.setPunctuator(TokenKind::pipepipe);
273
58.2k
              curCharPtr_ += 2;
274
58.2k
            }
275
320k
          } else if (curCharPtr_[1] == '=') {
276
0
            token_.setPunctuator(TokenKind::pipeequal);
277
0
            curCharPtr_ += 2;
278
320k
          } else {
279
320k
            token_.setPunctuator(TokenKind::pipe);
280
320k
            curCharPtr_ += 1;
281
320k
          }
282
379k
        }
283
379k
        break;
284
285
      // ? ?? ?.
286
2.32k
      case '?':
287
2.32k
        token_.setStart(curCharPtr_);
288
2.32k
        if (curCharPtr_[1] == '.' && !isdigit(curCharPtr_[2])) {
289
          // OptionalChainingPunctuator ::
290
          // ?. [lookahead does not contain DecimalDigit]
291
          // This is done to prevent `x?.3:y` from being recognized
292
          // as `x ?. 3 : y` instead of `x ? .3 : y`.
293
1.02k
          token_.setPunctuator(TokenKind::questiondot);
294
1.02k
          curCharPtr_ += 2;
295
1.29k
        } else if (
296
1.29k
            curCharPtr_[1] == '?' &&
297
1.29k
            LLVM_LIKELY(grammarContext != GrammarContext::Type)) {
298
0
          if (curCharPtr_[2] == '=') {
299
0
            token_.setPunctuator(TokenKind::questionquestionequal);
300
0
            curCharPtr_ += 3;
301
0
          } else {
302
0
            token_.setPunctuator(TokenKind::questionquestion);
303
0
            curCharPtr_ += 2;
304
0
          }
305
1.29k
        } else {
306
1.29k
          token_.setPunctuator(TokenKind::question);
307
1.29k
          curCharPtr_ += 1;
308
1.29k
        }
309
2.32k
        break;
310
311
      // * *= ** **=
312
519
      case '*':
313
519
        token_.setStart(curCharPtr_);
314
519
        if (curCharPtr_[1] == '=') {
315
0
          token_.setPunctuator(TokenKind::starequal);
316
0
          curCharPtr_ += 2;
317
519
        } else if (curCharPtr_[1] != '*') {
318
514
          token_.setPunctuator(TokenKind::star);
319
514
          curCharPtr_ += 1;
320
514
        } else if (curCharPtr_[2] == '=') {
321
0
          token_.setPunctuator(TokenKind::starstarequal);
322
0
          curCharPtr_ += 3;
323
5
        } else {
324
5
          token_.setPunctuator(TokenKind::starstar);
325
5
          curCharPtr_ += 2;
326
5
        }
327
519
        break;
328
329
        // * *=
330
        // ^ ^=
331
        // / /=
332
647
        PUNC_L2_2('^', TokenKind::caret, '=', TokenKind::caretequal);
333
334
      // % %=
335
26.2k
      case '%':
336
26.2k
        token_.setStart(curCharPtr_);
337
26.2k
        if (HERMES_PARSE_FLOW &&
338
26.2k
            LLVM_UNLIKELY(grammarContext == GrammarContext::Type) &&
339
26.2k
            curCharPtr_ + 7 <= bufferEnd_ &&
340
26.2k
            llvh::StringRef(curCharPtr_, 7) == "%checks") {
341
0
          token_.setIdentifier(getStringLiteral("%checks"));
342
0
          curCharPtr_ += 7;
343
26.2k
        } else if (curCharPtr_[1] == ('=')) {
344
0
          token_.setPunctuator(TokenKind::percentequal);
345
0
          curCharPtr_ += 2;
346
26.2k
        } else {
347
26.2k
          token_.setPunctuator(TokenKind::percent);
348
26.2k
          curCharPtr_ += 1;
349
26.2k
        }
350
26.2k
        break;
351
352
        // clang-format on
353
354
69
      case '\r':
355
3.49M
      case '\n':
356
3.49M
        ++curCharPtr_;
357
3.49M
        newLineBeforeCurrentToken_ = true;
358
3.49M
        continue;
359
360
      // Line separator \u2028 UTF8 encoded is      : e2 80 a8
361
      // Paragraph separator \u2029 UTF8 encoded is : e2 80 a9
362
50
      case UTF8_LINE_TERMINATOR_CHAR0:
363
50
        if (matchUnicodeLineTerminatorOffset1(curCharPtr_)) {
364
0
          curCharPtr_ += 3;
365
0
          newLineBeforeCurrentToken_ = true;
366
0
          continue;
367
50
        } else {
368
50
          goto default_label;
369
50
        }
370
371
20
      case '\v':
372
73
      case '\f':
373
73
        ++curCharPtr_;
374
73
        continue;
375
376
54
      case '\t':
377
9.73k
      case ' ':
378
        // Spaces frequently come in groups, so use a tight inner loop to skip.
379
9.73k
        do
380
9.94k
          ++curCharPtr_;
381
9.94k
        while (*curCharPtr_ == '\t' || *curCharPtr_ == ' ');
382
9.73k
        continue;
383
384
      // No-break space \u00A0 is UTF8 encoded as: c2 a0
385
0
      case 0xc2:
386
0
        if ((unsigned char)curCharPtr_[1] == 0xa0) {
387
0
          curCharPtr_ += 2;
388
0
          continue;
389
0
        } else {
390
0
          goto default_label;
391
0
        }
392
393
      // Byte-order mark \uFEFF is encoded as: ef bb bf
394
0
      case 0xef:
395
0
        if ((unsigned char)curCharPtr_[1] == 0xbb &&
396
0
            (unsigned char)curCharPtr_[2] == 0xbf) {
397
0
          curCharPtr_ += 3;
398
0
          continue;
399
0
        } else {
400
0
          goto default_label;
401
0
        }
402
403
467
      case '/':
404
467
        if (curCharPtr_[1] == '/') { // Line comment?
405
15
          scanLineComment(curCharPtr_);
406
15
          continue;
407
452
        } else if (curCharPtr_[1] == '*') { // Block comment?
408
5
          curCharPtr_ = skipBlockComment(curCharPtr_);
409
5
          continue;
410
447
        } else {
411
447
          token_.setStart(curCharPtr_);
412
447
          if (grammarContext == AllowRegExp) {
413
61
            scanRegExp();
414
386
          } else if (curCharPtr_[1] == '=') {
415
4
            token_.setPunctuator(TokenKind::slashequal);
416
4
            curCharPtr_ += 2;
417
382
          } else {
418
382
            token_.setPunctuator(TokenKind::slash);
419
382
            curCharPtr_ += 1;
420
382
          }
421
447
        }
422
447
        break;
423
424
447
      case '#':
425
0
        if (LLVM_UNLIKELY(
426
0
                curCharPtr_ == bufferStart_ && curCharPtr_[1] == '!')) {
427
          // #! (hashbang) at the very start of the buffer.
428
0
          scanLineComment(curCharPtr_);
429
0
          continue;
430
0
        }
431
0
        token_.setStart(curCharPtr_);
432
0
        if (!scanPrivateIdentifier()) {
433
0
          continue;
434
0
        }
435
0
        break;
436
437
      // <  <= << <<=
438
2
      case '<':
439
2
        token_.setStart(curCharPtr_);
440
2
        if (HERMES_PARSE_FLOW &&
441
2
            LLVM_UNLIKELY(grammarContext == JSLexer::GrammarContext::Type)) {
442
0
          token_.setPunctuator(TokenKind::less);
443
0
          curCharPtr_ += 1;
444
2
        } else if (curCharPtr_[1] == '=') {
445
0
          token_.setPunctuator(TokenKind::lessequal);
446
0
          curCharPtr_ += 2;
447
2
        } else if (curCharPtr_[1] == '<') {
448
0
          if (curCharPtr_[2] == '=') {
449
0
            token_.setPunctuator(TokenKind::lesslessequal);
450
0
            curCharPtr_ += 3;
451
0
          } else {
452
0
            token_.setPunctuator(TokenKind::lessless);
453
0
            curCharPtr_ += 2;
454
0
          }
455
2
        } else {
456
2
          token_.setPunctuator(TokenKind::less);
457
2
          curCharPtr_ += 1;
458
2
        }
459
2
        break;
460
461
      // > >= >> >>> >>= >>>=
462
21.2k
      case '>':
463
21.2k
        token_.setStart(curCharPtr_);
464
21.2k
        if ((HERMES_PARSE_FLOW &&
465
21.2k
             LLVM_UNLIKELY(grammarContext == JSLexer::GrammarContext::Type)) ||
466
21.2k
            (HERMES_PARSE_JSX &&
467
21.2k
             LLVM_UNLIKELY(
468
21.2k
                 grammarContext ==
469
21.2k
                 JSLexer::GrammarContext::AllowJSXIdentifier))) {
470
0
          token_.setPunctuator(TokenKind::greater);
471
0
          curCharPtr_ += 1;
472
21.2k
        } else if (curCharPtr_[1] == '=') { // >=
473
2
          token_.setPunctuator(TokenKind::greaterequal);
474
2
          curCharPtr_ += 2;
475
21.2k
        } else if (curCharPtr_[1] == '>') { // >>
476
24
          if (curCharPtr_[2] == '=') { // >>=
477
0
            token_.setPunctuator(TokenKind::greatergreaterequal);
478
0
            curCharPtr_ += 3;
479
24
          } else if (curCharPtr_[2] == '>') { // >>>
480
8
            if (curCharPtr_[3] == '=') { // >>>=
481
0
              token_.setPunctuator(TokenKind::greatergreatergreaterequal);
482
0
              curCharPtr_ += 4;
483
8
            } else {
484
8
              token_.setPunctuator(TokenKind::greatergreatergreater);
485
8
              curCharPtr_ += 3;
486
8
            }
487
16
          } else {
488
16
            token_.setPunctuator(TokenKind::greatergreater);
489
16
            curCharPtr_ += 2;
490
16
          }
491
21.2k
        } else {
492
21.2k
          token_.setPunctuator(TokenKind::greater);
493
21.2k
          curCharPtr_ += 1;
494
21.2k
        }
495
21.2k
        break;
496
497
11.0k
      case '.':
498
11.0k
        token_.setStart(curCharPtr_);
499
11.0k
        if (curCharPtr_[1] >= '0' && curCharPtr_[1] <= '9') {
500
4
          scanNumber(grammarContext);
501
11.0k
        } else if (curCharPtr_[1] == '.' && curCharPtr_[2] == '.') {
502
0
          token_.setPunctuator(TokenKind::dotdotdot);
503
0
          curCharPtr_ += 3;
504
11.0k
        } else {
505
11.0k
          token_.setPunctuator(TokenKind::period);
506
11.0k
          ++curCharPtr_;
507
11.0k
        }
508
11.0k
        break;
509
510
        // clang-format off
511
1.49M
      case '0': case '1': case '2': case '3': case '4':
512
2.54M
      case '5': case '6': case '7': case '8': case '9':
513
        // clang-format on
514
2.54M
        token_.setStart(curCharPtr_);
515
2.54M
        scanNumber(grammarContext);
516
2.54M
        break;
517
518
        // clang-format off
519
2.18k
      case '_': case '$':
520
290k
      case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
521
714k
      case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
522
819k
      case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
523
1.03M
      case 'v': case 'w': case 'x': case 'y': case 'z':
524
1.16M
      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
525
1.28M
      case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
526
1.28M
      case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
527
1.28M
      case 'V': case 'W': case 'X': case 'Y': case 'Z':
528
        // clang-format on
529
1.28M
        token_.setStart(curCharPtr_);
530
1.28M
        scanIdentifierFastPathInContext(curCharPtr_, grammarContext);
531
1.28M
        break;
532
533
2
      case '@':
534
2
        token_.setStart(curCharPtr_);
535
2
        if (HERMES_PARSE_FLOW &&
536
2
            LLVM_UNLIKELY(grammarContext == GrammarContext::Type)) {
537
0
          scanIdentifierFastPathInContext(curCharPtr_, grammarContext);
538
2
        } else {
539
2
          curCharPtr_ += 1;
540
2
          errorRange(token_.getStartLoc(), "unrecognized character '@'");
541
2
          continue;
542
2
        }
543
0
        break;
544
545
0
      case '\\': {
546
0
        token_.setStart(curCharPtr_);
547
0
        tmpStorage_.clear();
548
0
        uint32_t cp = consumeUnicodeEscape();
549
0
        if (!isUnicodeIDStart(cp)) {
550
0
          errorRange(
551
0
              token_.getStartLoc(),
552
0
              "Unicode escape \\u" + Twine::utohexstr(cp) +
553
0
                  " is not a valid identifier start");
554
0
          continue;
555
0
        } else {
556
0
          appendUnicodeToStorage(cp);
557
0
        }
558
0
        scanIdentifierPartsInContext(grammarContext);
559
0
        break;
560
0
      }
561
562
1
      case '\'':
563
45
      case '"':
564
45
        token_.setStart(curCharPtr_);
565
45
        scanStringInContext(grammarContext);
566
45
        break;
567
568
526k
      case '`':
569
526k
        token_.setStart(curCharPtr_);
570
526k
        scanTemplateLiteral();
571
526k
        break;
572
573
50
      default_label:
574
71
      default: {
575
71
        token_.setStart(curCharPtr_);
576
71
        uint32_t ch = decodeUTF8();
577
578
71
        if (isUnicodeOnlyLetter(ch)) {
579
0
          tmpStorage_.clear();
580
0
          appendUnicodeToStorage(ch);
581
0
          scanIdentifierPartsInContext(grammarContext);
582
71
        } else if (isUnicodeOnlySpace(ch)) {
583
50
          continue;
584
50
        } else {
585
21
          if (ch > 31 && ch < 127)
586
0
            errorRange(
587
0
                token_.getStartLoc(),
588
0
                "unrecognized character '" + Twine((char)ch) + "'");
589
21
          else
590
21
            errorRange(
591
21
                token_.getStartLoc(),
592
21
                "unrecognized Unicode character \\u" + Twine::utohexstr(ch));
593
21
          continue;
594
21
        }
595
596
0
        break;
597
71
      }
598
12.1M
    }
599
600
    // Always terminate the loop unless "continue" was used.
601
8.69M
    break;
602
12.1M
  } // for(;;)
603
604
8.69M
  finishToken(curCharPtr_);
605
606
8.69M
  return &token_;
607
8.69M
}
608
609
#if HERMES_PARSE_JSX
610
611
0
const Token *JSLexer::advanceInJSXChild() {
612
0
  token_.setStart(curCharPtr_);
613
0
  for (;;) {
614
0
    assert(curCharPtr_ <= bufferEnd_ && "lexing past end of input");
615
0
    switch (*curCharPtr_) {
616
0
      PUNC_L1_1('{', TokenKind::l_brace);
617
0
      PUNC_L1_1('<', TokenKind::less);
618
619
0
      case 0:
620
0
        if (curCharPtr_ == bufferEnd_) {
621
0
          token_.setEof();
622
0
          break;
623
0
        }
624
        // Fall-through to start scanning text.
625
0
        [[fallthrough]];
626
627
0
      default: {
628
0
        const char *start = curCharPtr_;
629
0
        token_.setStart(start);
630
631
        // Build up cooked value using XHTML entities
632
0
        tmpStorage_.clear();
633
0
        rawStorage_.clear();
634
0
        for (;;) {
635
0
          char c = *curCharPtr_;
636
637
0
          if (LLVM_UNLIKELY(isUTF8Start(*curCharPtr_))) {
638
0
            uint32_t codepoint = _decodeUTF8SlowPath(curCharPtr_);
639
0
            appendUnicodeToStorage(codepoint);
640
0
            appendUnicodeToStorage(codepoint, rawStorage_);
641
0
            continue;
642
0
          } else if (c == '&') {
643
0
            const char *htmlStart = curCharPtr_;
644
0
            auto codePoint = consumeHTMLEntityOptional();
645
0
            if (codePoint.hasValue()) {
646
0
              appendUnicodeToStorage(*codePoint);
647
0
              rawStorage_.append(
648
0
                  {htmlStart, (size_t)(curCharPtr_ - htmlStart)});
649
0
              continue;
650
0
            }
651
0
          } else if (
652
0
              (c == 0 && curCharPtr_ == bufferEnd_) || c == '{' || c == '<') {
653
0
            token_.setJSXText(
654
0
                getStringLiteral(tmpStorage_.str()),
655
0
                getStringLiteral(rawStorage_.str()));
656
0
            break;
657
0
          }
658
0
          tmpStorage_.push_back(c);
659
0
          rawStorage_.push_back(c);
660
0
          ++curCharPtr_;
661
0
        }
662
0
        break;
663
0
      }
664
0
    }
665
666
    // Always terminate the loop unless "continue" was used.
667
0
    break;
668
0
  }
669
0
  finishToken(curCharPtr_);
670
0
  return &token_;
671
0
}
672
673
0
llvh::Optional<uint32_t> JSLexer::consumeHTMLEntityOptional() {
674
0
  assert(*curCharPtr_ == '&');
675
0
  const char *start = curCharPtr_;
676
677
0
  if (curCharPtr_[1] == '#') {
678
0
    if (curCharPtr_[2] == 'x') {
679
      // HTML entity with form &#xHEX>;
680
0
      curCharPtr_ += 3;
681
0
      const char *numberStart = curCharPtr_;
682
683
0
      uint32_t codePoint = 0;
684
0
      char ch = *curCharPtr_;
685
686
      // Calculate code point from non-empty sequence of hex digits followed by
687
      // a semicolon.
688
0
      for (;;) {
689
0
        if (ch == ';' && curCharPtr_ != numberStart) {
690
0
          curCharPtr_++;
691
0
          return codePoint;
692
0
        } else if (isdigit(ch)) {
693
0
          ch -= '0';
694
0
        } else {
695
0
          ch |= 32;
696
0
          if (ch >= 'a' && ch <= 'f') {
697
0
            ch -= 'a' - 10;
698
0
          } else {
699
0
            break;
700
0
          }
701
0
        }
702
703
        // Check that this number is representable as a code point
704
0
        codePoint = (codePoint << 4) + ch;
705
0
        if (codePoint > UNICODE_MAX_VALUE) {
706
0
          break;
707
0
        }
708
709
0
        ++curCharPtr_;
710
0
        ch = *curCharPtr_;
711
0
      }
712
0
    } else {
713
      // HTML entity with form &#NUMBER;
714
0
      curCharPtr_ += 2;
715
0
      const char *numberStart = curCharPtr_;
716
717
0
      uint32_t codePoint = 0;
718
0
      char ch = *curCharPtr_;
719
720
      // Calculate code point from non-empty sequence of decimal digits followed
721
      // by a semicolon.
722
0
      for (;;) {
723
0
        if (ch == ';' && curCharPtr_ != numberStart) {
724
0
          curCharPtr_++;
725
0
          return codePoint;
726
0
        } else if (isdigit(ch)) {
727
          // Check that this number is representable as a code point
728
0
          codePoint = codePoint * 10 + (ch - '0');
729
0
          if (codePoint > UNICODE_MAX_VALUE) {
730
0
            break;
731
0
          }
732
0
        } else {
733
0
          break;
734
0
        }
735
736
0
        ++curCharPtr_;
737
0
        ch = *curCharPtr_;
738
0
      }
739
0
    }
740
0
  } else {
741
    // HTML entity with form &NAME;
742
0
    ++curCharPtr_;
743
744
    // Gather HTML entity name and lookup name in table. HTML entity names are
745
    // composed of a sequence of up to 8 alphanumeric characters followed by a
746
    // semicolon. To minimize backtracking due to an `&` without a following
747
    // semicolon we only need to look at most 9 characters ahead (8 for the
748
    // name, 1 for the semicolon).
749
0
    for (int i = 0; i < 9; i++) {
750
0
      char ch = *curCharPtr_;
751
0
      if (ch == ';') {
752
0
        auto it = htmlEntities_.find(llvh::StringRef(curCharPtr_ - i, i));
753
0
        if (it == htmlEntities_.end()) {
754
0
          break;
755
0
        }
756
757
0
        curCharPtr_++;
758
0
        return it->second;
759
0
      } else if (((ch | 32) >= 'a' && (ch | 32) <= 'z') || isdigit(ch)) {
760
0
        ++curCharPtr_;
761
0
      } else {
762
0
        break;
763
0
      }
764
0
    }
765
0
  }
766
767
0
  curCharPtr_ = start;
768
0
  return llvh::None;
769
0
}
770
771
#endif
772
773
0
bool JSLexer::isCurrentTokenADirective() {
774
  // The current token must be a string literal without escapes.
775
0
  if (token_.getKind() != TokenKind::string_literal ||
776
0
      token_.getStringLiteralContainsEscapes()) {
777
0
    return false;
778
0
  }
779
780
0
  const char *ptr = curCharPtr_;
781
782
  // A directive is a string literal (the current token, directly behind
783
  // curCharPtr_), followed by a semicolon, new line, or eof that we will now
784
  // try to find. There can also be comments. So, we loop, consuming whitespace
785
  // until we encounter:
786
  // - EOF. Don't consume it and succeed.
787
  // - Semicolon. Don't consume it and succeed.
788
  // - Right brace. Don't consume it and succeed.
789
  // - A new line. Don't consume it and succeed.
790
  // - A line comment. It implies a new line. Don't consume it and succeed.
791
  // - A block comment. Consume it and continue.
792
  // - Anything else. We consume nothing and fail.
793
794
0
  for (;;) {
795
0
    assert(ptr <= bufferEnd_ && "lexing past end of input");
796
797
0
    switch (*((const unsigned char *)ptr)) {
798
0
      case 0:
799
        // EOF?
800
0
        if (ptr == bufferEnd_)
801
0
          return true;
802
        // We encountered a stray 0 character.
803
0
        return false;
804
805
0
      case ';':
806
0
      case '}':
807
0
        return true;
808
809
0
      case '\r':
810
0
      case '\n':
811
0
        return true;
812
813
      // Line separator \u2028 UTF8 encoded is      : e2 80 a8
814
      // Paragraph separator \u2029 UTF8 encoded is : e2 80 a9
815
0
      case UTF8_LINE_TERMINATOR_CHAR0:
816
0
        if (matchUnicodeLineTerminatorOffset1(ptr))
817
0
          return true;
818
0
        return false;
819
820
0
      case '\v':
821
0
      case '\f':
822
        // Skip whitespace.
823
0
        ++ptr;
824
0
        continue;
825
826
0
      case '\t':
827
0
      case ' ':
828
        // Spaces frequently come in groups, so use a tight inner loop to skip.
829
0
        do
830
0
          ++ptr;
831
0
        while (*ptr == '\t' || *ptr == ' ');
832
0
        continue;
833
834
      // No-break space \u00A0 is UTF8 encoded as: c2 a0
835
0
      case 0xc2:
836
0
        if ((unsigned char)ptr[1] == 0xa0) {
837
0
          ptr += 2;
838
0
          continue;
839
0
        } else {
840
0
          goto default_label;
841
0
        }
842
843
      // Byte-order mark \uFEFF is encoded as: ef bb bf
844
0
      case 0xef:
845
0
        if ((unsigned char)ptr[1] == 0xbb && (unsigned char)ptr[2] == 0xbf) {
846
0
          ptr += 3;
847
0
          continue;
848
0
        } else {
849
0
          goto default_label;
850
0
        }
851
852
0
      case '/':
853
0
        if (ptr[1] == '/') { // Line comment?
854
          // It implies a new line, so we are good.
855
0
          return true;
856
0
        } else if (ptr[1] == '*') { // Block comment?
857
0
          auto savedCommentStorageSize = commentStorage_.size();
858
0
          auto commentScope = llvh::make_scope_exit([&] {
859
0
            if (storeComments_)
860
0
              commentStorage_.erase(
861
0
                  commentStorage_.begin() + savedCommentStorageSize,
862
0
                  commentStorage_.end());
863
0
          });
864
0
          SourceErrorManager::SaveAndSuppressMessages suppress(&sm_);
865
0
          ptr = skipBlockComment(ptr);
866
0
          continue;
867
0
        } else {
868
0
          return false;
869
0
        }
870
871
      // Handle all other characters: if it is a unicode space, skip it.
872
      // Otherwise we have failed.
873
0
      default_label:
874
0
      default: {
875
0
        if (hermes::isUTF8Start(*ptr)) {
876
0
          auto peeked = _peekUTF8(ptr);
877
0
          if (isUnicodeOnlySpace(peeked.first)) {
878
0
            ptr = peeked.second;
879
0
            continue;
880
0
          }
881
0
        }
882
0
        return false;
883
0
      }
884
0
    }
885
0
  }
886
887
  // We arrive here if we matched a directive. 'ptr' is the final character.
888
0
  return true;
889
0
}
890
891
174k
/// Re-lex the current '}' token as the continuation of a template literal.
/// The current token must be an r_brace. The lexer steps back onto the '}',
/// discards the stored token (when tokens are being stored) and scans again
/// from that position with scanTemplateLiteral().
/// \return a pointer to the lexer's current token after the rescan.
const Token *JSLexer::rescanRBraceInTemplateLiteral() {
  assert(token_.getKind() == TokenKind::r_brace && "need } to rescan");

  // The '}' was already recorded as a token; forget it before rescanning.
  if (LLVM_UNLIKELY(storeTokens_))
    tokenStorage_.pop_back();

  // Move the cursor back onto the '}' that was just consumed.
  const char *rbrace = curCharPtr_ - 1;
  assert(*rbrace == '}' && "non-} was scanned as r_brace");
  curCharPtr_ = rbrace;
  token_.setStart(rbrace);

  scanTemplateLiteral();
  finishToken(curCharPtr_);
  return &token_;
}
904
905
152
/// Look one token ahead of the current identifier, reserved word or '?'.
/// Errors are suppressed and any comments stored while looking ahead are
/// discarded.
/// \param expectedToken if the next token is on the same line and has this
///   kind, the lexer is left positioned on it instead of being rewound.
/// \return the kind of the next token, or None if a line terminator precedes
///   it.
OptValue<TokenKind> JSLexer::lookahead1(OptValue<TokenKind> expectedToken) {
  // We support TokenKind::question here because of Flow's render types.
  // `renders?` is not a token itself (as making it a token would be bad for
  // identifier parsing performance). When we are parsing something like
  // (renders?: number) => string and the cursor is under the `?`, we need to
  // perform a lookahead to see if the next token is a colon, in which case
  // this is a function parameter, and if not then parse as a render type.
  assert(
      (token_.getKind() == TokenKind::identifier || token_.isResWord() ||
       token_.getKind() == TokenKind::question) &&
      "unsupported current token");

  // Stays null for '?', which carries no identifier. Initialized so that the
  // variable never holds an indeterminate value.
  UniqueString *savedIdent = nullptr;
  if (token_.getKind() == TokenKind::identifier || token_.isResWord()) {
    savedIdent = token_.getResWordOrIdentifier();
  }
  TokenKind savedKind = token_.getKind();
  SMLoc start = token_.getStartLoc();
  SMLoc end = token_.getEndLoc();
  const char *cur = curCharPtr_;
  SourceErrorManager::SaveAndSuppressMessages suppress(&sm_);

  // Remove any comments that were stored during the lookahead
  auto savedCommentStorageSize = commentStorage_.size();
  auto commentScope = llvh::make_scope_exit([&] {
    if (storeComments_)
      commentStorage_.erase(
          commentStorage_.begin() + savedCommentStorageSize,
          commentStorage_.end());
  });

  advance();
  OptValue<TokenKind> kind = token_.getKind();
  if (isNewLineBeforeCurrentToken()) {
    // Disregard anything after LineTerminator.
    kind = llvh::None;
  } else if (expectedToken == kind) {
    // Do not move the cursor back.
    return kind;
  }

  // Restore the token we started from.
  token_.setStart(start.getPointer());
  token_.setEnd(end.getPointer());
  if (savedKind == TokenKind::identifier) {
    token_.setIdentifier(savedIdent);
  } else if (savedKind == TokenKind::question) {
    token_.setPunctuator(TokenKind::question);
  } else {
    token_.setResWord(savedKind, savedIdent);
  }
  seek(SMLoc::getFromPointer(cur));

  // Undo the storage for the token we just advanced to.
  if (LLVM_UNLIKELY(storeTokens_)) {
    tokenStorage_.pop_back();
  }

  return kind;
}
963
964
0
uint32_t JSLexer::consumeUnicodeEscape() {
965
0
  assert(*curCharPtr_ == '\\');
966
0
  ++curCharPtr_;
967
968
0
  if (*curCharPtr_ != 'u') {
969
0
    error(
970
0
        {SMLoc::getFromPointer(curCharPtr_ - 1),
971
0
         SMLoc::getFromPointer(curCharPtr_ + 1)},
972
0
        "invalid Unicode escape");
973
0
    return UNICODE_REPLACEMENT_CHARACTER;
974
0
  }
975
0
  ++curCharPtr_;
976
977
0
  if (*curCharPtr_ == '{') {
978
0
    auto cp = consumeBracedCodePoint();
979
0
    if (!cp.hasValue()) {
980
      // consumeBracedCodePoint has reported an error.
981
0
      return UNICODE_REPLACEMENT_CHARACTER;
982
0
    }
983
0
    return *cp;
984
0
  }
985
986
0
  auto cp = consumeHex(4);
987
0
  if (!cp)
988
0
    return UNICODE_REPLACEMENT_CHARACTER;
989
990
  // We don't need t check for valid UTF-16. JavaScript allows invalid surrogate
991
  // pairs, so we just encode every UTF-16 code into a UTF-8 sequence, even
992
  // though theoretically it is not a valid UTF-8. (UTF-8 would be "valid" if we
993
  // collected the surrogate pair, decoded it into UTF-32 and encoded that into
994
  // UTF-16).
995
0
  return cp.getValue();
996
0
}
997
998
192
/// Try to consume a Unicode escape ("\uXXXX" or "\u{...}") starting at the
/// backslash under the cursor, without reporting errors.
/// \return the decoded value on success; on failure returns None and leaves
///   the cursor on the backslash.
llvh::Optional<uint32_t> JSLexer::consumeUnicodeEscapeOptional() {
  const char *const backslash = curCharPtr_;
  assert(*backslash == '\\');

  llvh::Optional<uint32_t> decoded = llvh::None;
  if (backslash[1] == 'u') {
    // Skip over "\u" and decode the payload; errors are not reported because
    // the escape is being consumed optionally.
    //
    // Surrogate values are passed through unchanged: JavaScript allows
    // invalid surrogate pairs, so no UTF-16 validation is performed here.
    curCharPtr_ = backslash + 2;
    decoded = *curCharPtr_ == '{' ? consumeBracedCodePoint(false)
                                  : consumeHex(4, false);
  }

  // Rewind on any failure so the caller sees the input untouched.
  if (!decoded)
    curCharPtr_ = backslash;
  return decoded;
}
1032
1033
2.54M
bool JSLexer::consumeIdentifierStart() {
1034
2.54M
  if (*curCharPtr_ == '_' || *curCharPtr_ == '$' ||
1035
2.54M
      ((*curCharPtr_ | 32) >= 'a' && (*curCharPtr_ | 32) <= 'z')) {
1036
11
    tmpStorage_.clear();
1037
11
    tmpStorage_.push_back(*curCharPtr_++);
1038
11
    return true;
1039
11
  }
1040
1041
2.54M
  if (*curCharPtr_ == '\\') {
1042
0
    SMLoc startLoc = SMLoc::getFromPointer(curCharPtr_);
1043
0
    tmpStorage_.clear();
1044
0
    uint32_t cp = consumeUnicodeEscape();
1045
0
    if (!isUnicodeIDStart(cp)) {
1046
0
      errorRange(
1047
0
          startLoc,
1048
0
          "Unicode escape \\u" + Twine::utohexstr(cp) +
1049
0
              "is not a valid identifier start");
1050
0
    } else {
1051
0
      appendUnicodeToStorage(cp);
1052
0
    }
1053
0
    return true;
1054
0
  }
1055
1056
2.54M
  if (LLVM_LIKELY(!isUTF8Start(*curCharPtr_)))
1057
2.54M
    return false;
1058
1059
0
  auto decoded = _peekUTF8();
1060
0
  if (isUnicodeIDStart(decoded.first)) {
1061
0
    tmpStorage_.clear();
1062
0
    appendUnicodeToStorage(decoded.first);
1063
0
    curCharPtr_ = decoded.second;
1064
0
    return true;
1065
0
  }
1066
1067
0
  return false;
1068
0
}
1069
1070
template <JSLexer::IdentifierMode Mode>
1071
279k
bool JSLexer::consumeOneIdentifierPartNoEscape() {
1072
279k
  char ch = *curCharPtr_;
1073
279k
  if (ch == '_' || ch == '$' || ((ch | 32) >= 'a' && (ch | 32) <= 'z') ||
1074
279k
      (ch >= '0' && ch <= '9') || (Mode == IdentifierMode::JSX && ch == '-') ||
1075
279k
      (Mode == IdentifierMode::Flow && ch == '@')) {
1076
273k
    tmpStorage_.push_back(*curCharPtr_++);
1077
273k
    return true;
1078
273k
  } else if (LLVM_UNLIKELY(isUTF8Start(ch))) {
1079
    // If we have encountered a Unicode character, we try to decode it. If it
1080
    // can be a part of the identifier, we consume it, otherwise we leave it
1081
    // alone.
1082
3.84k
    auto decoded = _peekUTF8();
1083
3.84k
    if (isUnicodeIDContinue(decoded.first)) {
1084
3.84k
      appendUnicodeToStorage(decoded.first);
1085
3.84k
      curCharPtr_ = decoded.second;
1086
3.84k
      return true;
1087
3.84k
    }
1088
3.84k
  }
1089
2.21k
  return false;
1090
279k
}
Unexecuted instantiation: bool hermes::parser::JSLexer::consumeOneIdentifierPartNoEscape<(hermes::parser::JSLexer::IdentifierMode)1>()
Unexecuted instantiation: bool hermes::parser::JSLexer::consumeOneIdentifierPartNoEscape<(hermes::parser::JSLexer::IdentifierMode)2>()
bool hermes::parser::JSLexer::consumeOneIdentifierPartNoEscape<(hermes::parser::JSLexer::IdentifierMode)0>()
Line
Count
Source
1071
279k
bool JSLexer::consumeOneIdentifierPartNoEscape() {
1072
279k
  char ch = *curCharPtr_;
1073
279k
  if (ch == '_' || ch == '$' || ((ch | 32) >= 'a' && (ch | 32) <= 'z') ||
1074
279k
      (ch >= '0' && ch <= '9') || (Mode == IdentifierMode::JSX && ch == '-') ||
1075
279k
      (Mode == IdentifierMode::Flow && ch == '@')) {
1076
273k
    tmpStorage_.push_back(*curCharPtr_++);
1077
273k
    return true;
1078
273k
  } else if (LLVM_UNLIKELY(isUTF8Start(ch))) {
1079
    // If we have encountered a Unicode character, we try to decode it. If it
1080
    // can be a part of the identifier, we consume it, otherwise we leave it
1081
    // alone.
1082
3.84k
    auto decoded = _peekUTF8();
1083
3.84k
    if (isUnicodeIDContinue(decoded.first)) {
1084
3.84k
      appendUnicodeToStorage(decoded.first);
1085
3.84k
      curCharPtr_ = decoded.second;
1086
3.84k
      return true;
1087
3.84k
    }
1088
3.84k
  }
1089
2.21k
  return false;
1090
279k
}
1091
1092
template <JSLexer::IdentifierMode Mode>
1093
2.14k
void JSLexer::consumeIdentifierParts() {
1094
24.4k
  for (;;) {
1095
    // Try consuming an non-escaped identifier part. Failing that, check for an
1096
    // escape.
1097
24.4k
    if (consumeOneIdentifierPartNoEscape<Mode>())
1098
22.3k
      continue;
1099
2.14k
    else if (*curCharPtr_ == '\\') {
1100
      // Decode the escape.
1101
0
      SMLoc startLoc = SMLoc::getFromPointer(curCharPtr_);
1102
0
      uint32_t cp = consumeUnicodeEscape();
1103
0
      if (!isUnicodeIDContinue(cp)) {
1104
0
        errorRange(
1105
0
            startLoc,
1106
0
            "Unicode escape \\u" + Twine::utohexstr(cp) +
1107
0
                " is not a valid identifier codepoint");
1108
0
      } else {
1109
0
        appendUnicodeToStorage(cp);
1110
0
      }
1111
0
    } else
1112
2.14k
      break;
1113
24.4k
  }
1114
2.14k
}
Unexecuted instantiation: void hermes::parser::JSLexer::consumeIdentifierParts<(hermes::parser::JSLexer::IdentifierMode)1>()
Unexecuted instantiation: void hermes::parser::JSLexer::consumeIdentifierParts<(hermes::parser::JSLexer::IdentifierMode)2>()
void hermes::parser::JSLexer::consumeIdentifierParts<(hermes::parser::JSLexer::IdentifierMode)0>()
Line
Count
Source
1093
2.14k
void JSLexer::consumeIdentifierParts() {
1094
24.4k
  for (;;) {
1095
    // Try consuming an non-escaped identifier part. Failing that, check for an
1096
    // escape.
1097
24.4k
    if (consumeOneIdentifierPartNoEscape<Mode>())
1098
22.3k
      continue;
1099
2.14k
    else if (*curCharPtr_ == '\\') {
1100
      // Decode the escape.
1101
0
      SMLoc startLoc = SMLoc::getFromPointer(curCharPtr_);
1102
0
      uint32_t cp = consumeUnicodeEscape();
1103
0
      if (!isUnicodeIDContinue(cp)) {
1104
0
        errorRange(
1105
0
            startLoc,
1106
0
            "Unicode escape \\u" + Twine::utohexstr(cp) +
1107
0
                " is not a valid identifier codepoint");
1108
0
      } else {
1109
0
        appendUnicodeToStorage(cp);
1110
0
      }
1111
0
    } else
1112
2.14k
      break;
1113
24.4k
  }
1114
2.14k
}
1115
1116
2
unsigned char JSLexer::consumeOctal(unsigned maxLen) {
1117
2
  assert(*curCharPtr_ >= '0' && *curCharPtr_ <= '7');
1118
1119
2
  if (strictMode_) {
1120
0
    if (!error(
1121
0
            SMLoc::getFromPointer(curCharPtr_ - 1),
1122
0
            "octals not allowed in strict mode")) {
1123
0
      return 0;
1124
0
    }
1125
0
  }
1126
1127
2
  auto res = (unsigned char)(*curCharPtr_++ - '0');
1128
2
  while (--maxLen && *curCharPtr_ >= '0' && *curCharPtr_ <= '7')
1129
0
    res = (res << 3) + *curCharPtr_++ - '0';
1130
1131
2
  return res;
1132
2
}
1133
1134
/// Consume exactly \p requiredLen hexadecimal digits at the cursor.
/// \param errorOnFail whether to report "invalid hex number" when a non-hex
///   character is encountered.
/// \return the parsed value, or None on the first non-hex character (the
///   cursor is then left on that character).
llvh::Optional<uint32_t> JSLexer::consumeHex(
    unsigned requiredLen,
    bool errorOnFail) {
  uint32_t value = 0;

  for (unsigned remaining = requiredLen; remaining != 0;
       --remaining, ++curCharPtr_) {
    const unsigned raw = *curCharPtr_;
    // Setting bit 5 folds 'A'-'F' onto 'a'-'f'; only consulted once we know
    // the character is not a decimal digit.
    const unsigned lowered = raw | 32;

    unsigned digit;
    if (raw >= '0' && raw <= '9') {
      digit = raw - '0';
    } else if (lowered >= 'a' && lowered <= 'f') {
      digit = lowered - 'a' + 10;
    } else {
      if (errorOnFail)
        error(SMLoc::getFromPointer(curCharPtr_), "invalid hex number");
      return llvh::None;
    }

    value = (value << 4) + digit;
  }

  return value;
}
1160
1161
78
/// Consume a braced code point "{hex digits}" starting at the '{' under the
/// cursor, up to and including the closing '}'.
/// \param errorOnFail whether to report errors. At most one error is reported
///   per escape, except that hitting the end of the buffer returns
///   immediately.
/// \return the code point, or None if the escape was empty, contained an
///   invalid character, exceeded UNICODE_MAX_VALUE, was not terminated, or
///   error() returned false.
llvh::Optional<uint32_t> JSLexer::consumeBracedCodePoint(bool errorOnFail) {
  assert(*curCharPtr_ == '{' && "braced codepoint must begin with {");
  ++curCharPtr_;
  const char *const digitsStart = curCharPtr_;

  // Becomes true once the escape is known to be invalid. Scanning continues
  // to the '}' regardless so that lexing can resume after the escape.
  bool failed = false;
  uint32_t cp = 0;

  while (*curCharPtr_ != '}') {
    const int c = *curCharPtr_;

    // Value of the hex digit, or -1 if this is not a hex digit.
    int nibble = -1;
    if (c >= '0' && c <= '9')
      nibble = c - '0';
    else if (c >= 'a' && c <= 'f')
      nibble = c - 'a' + 10;
    else if (c >= 'A' && c <= 'F')
      nibble = c - 'A' + 10;

    if (nibble < 0) {
      // The only way this can be the end of the buffer is if this is a \0.
      if (curCharPtr_ == bufferEnd_) {
        if (!failed && errorOnFail) {
          error(
              SMLoc::getFromPointer(digitsStart),
              "non-terminated unicode codepoint escape");
        }
        return llvh::None;
      }
      // Invalid character: remember the failure and keep scanning.
      if (!failed && errorOnFail &&
          !error(
              SMLoc::getFromPointer(curCharPtr_),
              "invalid character in unicode codepoint escape")) {
        return llvh::None;
      }
      failed = true;
    } else {
      cp = (cp << 4) + nibble;
      if (cp > UNICODE_MAX_VALUE) {
        // Number grew too big: remember the failure and keep scanning.
        if (!failed && errorOnFail &&
            !error(
                SMLoc::getFromPointer(digitsStart),
                "unicode codepoint escape is too large")) {
          return llvh::None;
        }
        failed = true;
      }
    }

    ++curCharPtr_;
  }

  assert(curCharPtr_ < bufferEnd_ && "bufferEnd_ should cause early return");

  // An empty escape sequence is invalid.
  if (curCharPtr_ == digitsStart) {
    if (!failed && errorOnFail &&
        !error(
            SMLoc::getFromPointer(digitsStart),
            "empty unicode codepoint escape")) {
      return llvh::None;
    }
    failed = true;
  }

  // Consume the final } and return.
  ++curCharPtr_;
  if (failed)
    return llvh::None;
  return llvh::Optional<uint32_t>{cp};
}
1234
1235
15
/// Scan a line comment beginning at \p start, which must point at "//" or
/// "#!". The comment ends at a line terminator or at the end of the buffer.
/// On return the cursor is past the line terminator, if there was one, and
/// newLineBeforeCurrentToken_ has been set in that case.
/// \return the comment text from \p start up to, but excluding, the line
///   terminator.
llvh::StringRef JSLexer::lineCommentHelper(const char *start) {
  assert(
      (start[0] == '/' && start[1] == '/') ||
      (start[0] == '#' && start[1] == '!'));
  const char *commentEnd;
  const char *cur = start + 2;

  for (;;) {
    const unsigned char c = (unsigned char)*cur;

    if (c == 0) {
      // A NUL terminates the comment only at the very end of the buffer.
      if (cur == bufferEnd_) {
        commentEnd = cur;
        break;
      }
      ++cur;
    } else if (c == '\r' || c == '\n') {
      commentEnd = cur;
      ++cur;
      newLineBeforeCurrentToken_ = true;
      break;
    } else if (c == UTF8_LINE_TERMINATOR_CHAR0) {
      // Line separator \u2028 UTF8 encoded is      : e2 80 a8
      // Paragraph separator \u2029 UTF8 encoded is: e2 80 a9
      if (matchUnicodeLineTerminatorOffset1(cur)) {
        commentEnd = cur;
        cur += 3;
        newLineBeforeCurrentToken_ = true;
        break;
      }
      _decodeUTF8SlowPath(cur);
    } else if (LLVM_UNLIKELY(isUTF8Start(*cur))) {
      _decodeUTF8SlowPath(cur);
    } else {
      ++cur;
    }
  }

  curCharPtr_ = cur;
  return llvh::StringRef(start, commentEnd - start);
}
1286
1287
15
void JSLexer::scanLineComment(const char *start) {
1288
15
  llvh::StringRef comment = lineCommentHelper(start);
1289
1290
15
  if (storeComments_) {
1291
0
    commentStorage_.emplace_back(
1292
0
        start[0] == '/' ? StoredComment::Kind::Line
1293
0
                        : StoredComment::Kind::Hashbang,
1294
0
        SMRange{
1295
0
            SMLoc::getFromPointer(comment.begin()),
1296
0
            SMLoc::getFromPointer(comment.end())});
1297
0
  }
1298
1299
  // Check for magic comments, which excludes #!.
1300
  // Syntax is //# name=value
1301
15
  if (!comment.consume_front(llvh::StringLiteral("//# ")))
1302
5
    return;
1303
1304
10
  if (comment.consume_front(llvh::StringLiteral("sourceURL=")))
1305
0
    sm_.setSourceUrl(bufId_, comment);
1306
10
  else if (comment.consume_front(llvh::StringLiteral("sourceMappingURL=")))
1307
0
    sm_.setSourceMappingUrl(bufId_, comment);
1308
10
}
1309
1310
5
const char *JSLexer::skipBlockComment(const char *start) {
1311
5
  assert(start[0] == '/' && start[1] == '*');
1312
0
  SMLoc blockCommentStart = SMLoc::getFromPointer(start);
1313
5
  const char *cur = start + 2;
1314
1315
1.30M
  for (;;) {
1316
1.30M
    switch ((unsigned char)*cur) {
1317
517k
      case 0:
1318
517k
        if (cur == bufferEnd_) {
1319
5
          error(SMLoc::getFromPointer(cur), "non-terminated block comment");
1320
5
          sm_.note(blockCommentStart, "comment started here");
1321
5
          goto endLoop;
1322
517k
        } else {
1323
517k
          ++cur;
1324
517k
        }
1325
517k
        break;
1326
1327
517k
      case '\r':
1328
13.7k
      case '\n':
1329
13.7k
        ++cur;
1330
13.7k
        newLineBeforeCurrentToken_ = true;
1331
13.7k
        break;
1332
1333
      // Line separator \u2028 UTF8 encoded is      : e2 80 a8
1334
      // Paragraph separator \u2029 UTF8 encoded is: e2 80 a9
1335
355
      case UTF8_LINE_TERMINATOR_CHAR0:
1336
355
        if (matchUnicodeLineTerminatorOffset1(cur)) {
1337
0
          cur += 3;
1338
0
          newLineBeforeCurrentToken_ = true;
1339
355
        } else {
1340
355
          _decodeUTF8SlowPath(cur);
1341
355
        }
1342
355
        break;
1343
1344
4.25k
      case '*':
1345
4.25k
        ++cur;
1346
4.25k
        if (*cur == '/') {
1347
0
          ++cur;
1348
0
          goto endLoop;
1349
0
        }
1350
4.25k
        break;
1351
1352
767k
      default:
1353
767k
        if (LLVM_UNLIKELY(isUTF8Start(*cur)))
1354
183k
          _decodeUTF8SlowPath(cur);
1355
584k
        else
1356
584k
          ++cur;
1357
767k
        break;
1358
1.30M
    }
1359
1.30M
  }
1360
5
endLoop:
1361
1362
5
  if (storeComments_) {
1363
0
    commentStorage_.emplace_back(
1364
0
        StoredComment::Kind::Block,
1365
0
        SMRange{blockCommentStart, SMLoc::getFromPointer(cur)});
1366
0
  }
1367
1368
5
  return cur;
1369
5
}
1370
1371
2.54M
/// Scan a numeric or BigInt literal starting at the cursor and store the
/// result in token_.
/// Handles the 0x/0o/0b prefixes, legacy octal literals (leading '0'),
/// fractions, exponents and the '_' numeric separator. Any identifier
/// characters directly following the number are consumed as well; unless
/// they form a valid BigInt suffix "n" the literal is reported as invalid.
/// On error the numeric value is NaN.
/// \param grammarContext in GrammarContext::Type, legacy octal literals are
///   rejected as they are in strict mode.
void JSLexer::scanNumber(GrammarContext grammarContext) {
  // A somewhat ugly state machine for scanning a number
  //
  // Note: every isdigit() argument is cast to unsigned char, because passing
  // a negative char (any byte >= 0x80 where char is signed) is undefined
  // behavior.

  unsigned radix = 10;
  bool real = false;
  bool ok = true;
  const char *rawStart = curCharPtr_;
  const char *start = curCharPtr_;

  // True when we encounter the numeric literal separator: '_'.
  bool seenSeparator = false;

  // True when we encounter a legacy octal number (starts with '0').
  bool legacyOctal = false;

  // Detect the radix
  if (*curCharPtr_ == '0') {
    if ((curCharPtr_[1] | 32) == 'x') {
      radix = 16;
      curCharPtr_ += 2;
      start += 2;
    } else if ((curCharPtr_[1] | 32) == 'o') {
      radix = 8;
      curCharPtr_ += 2;
      start += 2;
    } else if ((curCharPtr_[1] | 32) == 'b') {
      radix = 2;
      curCharPtr_ += 2;
      start += 2;
    } else if (curCharPtr_[1] == '.') {
      curCharPtr_ += 2;
      goto fraction;
    } else if ((curCharPtr_[1] | 32) == 'e') {
      curCharPtr_ += 2;
      goto exponent;
    } else {
      radix = 8;
      legacyOctal = true;
      ++curCharPtr_;
    }
  }

  while (isdigit((unsigned char)*curCharPtr_) ||
         (radix == 16 && (*curCharPtr_ | 32) >= 'a' &&
          (*curCharPtr_ | 32) <= 'f') ||
         (*curCharPtr_ == '_')) {
    seenSeparator |= *curCharPtr_ == '_';
    ++curCharPtr_;
  }

  if (radix == 10 || legacyOctal) {
    // It is not necessarily an integer.
    // We could have interpreted as legacyOctal initially but will have to
    // change to decimal later.
    if (*curCharPtr_ == '.') {
      ++curCharPtr_;
      goto fraction;
    }

    if ((*curCharPtr_ | 32) == 'e') {
      ++curCharPtr_;
      goto exponent;
    }
  }

  goto end;

fraction:
  // We arrive here after we have consumed the decimal dot ".".
  //
  real = true;
  while (isdigit((unsigned char)*curCharPtr_) || *curCharPtr_ == '_') {
    seenSeparator |= *curCharPtr_ == '_';
    ++curCharPtr_;
  }

  if ((*curCharPtr_ | 32) == 'e') {
    ++curCharPtr_;
    goto exponent;
  } else {
    goto end;
  }

exponent:
  // We arrive here after we have consumed the exponent character 'e' or 'E'.
  //
  real = true;
  if (*curCharPtr_ == '+' || *curCharPtr_ == '-')
    ++curCharPtr_;
  if (isdigit((unsigned char)*curCharPtr_)) {
    do {
      seenSeparator |= *curCharPtr_ == '_';
      ++curCharPtr_;
    } while (isdigit((unsigned char)*curCharPtr_) || *curCharPtr_ == '_');
  } else {
    ok = false;
  }

end:
  // We arrive here after we have consumed all we can from the number. Now,
  // as per the spec, we consume a sequence of identifier characters if they
  // follow directly, which means the number is invalid if it's not BigInt.
  if (consumeIdentifierStart()) {
    consumeIdentifierParts<IdentifierMode::JS>();

    llvh::StringRef raw{rawStart, (size_t)(curCharPtr_ - rawStart)};
    if (ok && !real && (!legacyOctal || raw == "0n") && tmpStorage_ == "n") {
      assert(curCharPtr_ > start && "Must consume at least the trailing n.");
      llvh::ArrayRef<char> digits{start, curCharPtr_ - 1};
      // Use parseIntWithRadixDigits to validate the bigint literal's digits.
      // The digits themselves can be ignored, since we're only interested in
      // whether the string was parsed correctly.
      if (digits.size() &&
          parseIntWithRadixDigits</* AllowNumericSeparator */ true>(
              digits, radix, [](uint8_t) {})) {
        // This is a BigInt.
        rawStorage_.clear();
        rawStorage_.append(raw);
        token_.setBigIntLiteral(getStringLiteral(rawStorage_));
        return;
      }

      // This is a BigInt with invalid digits; fail.
    }

    ok = false;
  }

  double val;

  /// ES6.0 B.1.1
  /// If we encounter a "legacy" octal number (starting with a '0') but if
  /// the integer contains '8' or '9' we interpret it as decimal.
  const auto updateLegacyOctalRadix =
      [this, &radix, start, &legacyOctal]() -> void {
    assert(
        legacyOctal &&
        "updateLegacyOctalRadix can only be called in legacyOctal mode");
    (void)legacyOctal;
    for (auto *scanPtr = start; scanPtr != curCharPtr_; ++scanPtr) {
      // Only the integer part is inspected. The exponent marker may be either
      // case; an upper-case 'E' must not be mistaken for an 8 or 9 digit by
      // the comparison below.
      if (*scanPtr == '.' || (*scanPtr | 32) == 'e') {
        break;
      }
      if (LLVM_UNLIKELY(*scanPtr >= '8') && LLVM_LIKELY(*scanPtr != '_')) {
        sm_.warning(
            SMRange(token_.getStartLoc(), SMLoc::getFromPointer(curCharPtr_)),
            "Numeric literal starts with 0 but contains an 8 or 9 digit. "
            "Interpreting as decimal (not octal).");
        radix = 10;
        break;
      }
    }
  };

  if (!ok) {
    errorRange(token_.getStartLoc(), "invalid numeric literal");
    val = std::numeric_limits<double>::quiet_NaN();
  } else if (
      !real && radix == 10 && curCharPtr_ - start <= 9 &&
      LLVM_LIKELY(!seenSeparator)) {
    // If this is a decimal integer of at most 9 digits (log10(2**31-1), it
    // can fit in a 32-bit integer. Use a faster conversion.
    int32_t ival = *start - '0';
    while (++start != curCharPtr_)
      ival = ival * 10 + (*start - '0');
    val = ival;
  } else if (real || radix == 10) {
    if (legacyOctal) {
      if (strictMode_ || grammarContext == GrammarContext::Type) {
        if (!errorRange(
                token_.getStartLoc(),
                "Decimals with leading zeros are not allowed in strict mode")) {
          val = std::numeric_limits<double>::quiet_NaN();
          goto done;
        }
      } else {
        // Check to see if we can actually scan this as radix 10.
        // Non-integer numbers must be in base 10, otherwise we error.
        updateLegacyOctalRadix();
        if (LLVM_LIKELY(radix != 10)) {
          if (!errorRange(
                  token_.getStartLoc(),
                  "Octal numeric literals must be integers")) {
            val = std::numeric_limits<double>::quiet_NaN();
            goto done;
          }
        }
      }
    }

    // We need a zero-terminated buffer for hermes_g_strtod().
    llvh::SmallString<32> buf;
    buf.reserve(curCharPtr_ - start + 1);
    if (LLVM_UNLIKELY(seenSeparator)) {
      for (const char *it = start; it != curCharPtr_; ++it) {
        if (LLVM_LIKELY(*it != '_')) {
          buf.push_back(*it);
        } else {
          // Check to ensure that '_' is surrounded by digits.
          // This is safe because the source buffer is zero-terminated and
          // we know that the numeric literal didn't start with '_'.
          // Note that we could have a 0b_11 literal, but we'd still fail
          // properly because of the radix==16 check.
          char prev = *(it - 1);
          char next = *(it + 1);
          if (!isdigit((unsigned char)prev) &&
              !(radix == 16 && 'a' <= (prev | 32) && (prev | 32) <= 'f')) {
            errorRange(
                token_.getStartLoc(),
                "numeric separator must come after a digit");
          } else if (
              !isdigit((unsigned char)next) &&
              !(radix == 16 && 'a' <= (next | 32) && (next | 32) <= 'f')) {
            errorRange(
                token_.getStartLoc(),
                "numeric separator must come before a digit");
          }
        }
      }
    } else {
      buf.append(start, curCharPtr_);
    }
    buf.push_back(0);
    char *endPtr;
    val = ::hermes_g_strtod(buf.data(), &endPtr);
    // The whole buffer up to the terminating zero must have been consumed.
    if (endPtr != &buf.back()) {
      errorRange(token_.getStartLoc(), "invalid numeric literal");
      val = std::numeric_limits<double>::quiet_NaN();
    }
  } else {
    if (legacyOctal &&
        (strictMode_ || grammarContext == GrammarContext::Type) &&
        curCharPtr_ - start > 1) {
      if (!errorRange(
              token_.getStartLoc(),
              "Octal literals must use '0o' in strict mode")) {
        val = std::numeric_limits<double>::quiet_NaN();
        goto done;
      }
    }

    // Handle the zero-radix case. This could only happen with radix 16
    // because otherwise start wouldn't have been changed.
    if (curCharPtr_ == start) {
      errorRange(
          token_.getStartLoc(),
          llvh::Twine("No digits after ") + llvh::StringRef(start - 2, 2));
      val = std::numeric_limits<double>::quiet_NaN();
    } else {
      // Parse the rest of the number:
      if (legacyOctal) {
        updateLegacyOctalRadix();
        // LegacyOctalLikeDecimalIntegerLiteral cannot contain separators.
        if (LLVM_UNLIKELY(seenSeparator)) {
          errorRange(
              token_.getStartLoc(),
              "Numeric separator cannot be used in literal after leading 0");
        }
      }
      auto parsedInt = parseIntWithRadix</* AllowNumericSeparator */ true>(
          llvh::ArrayRef<char>{start, (size_t)(curCharPtr_ - start)}, radix);
      if (!parsedInt) {
        errorRange(token_.getStartLoc(), "invalid integer literal");
        val = std::numeric_limits<double>::quiet_NaN();
      } else {
        val = parsedInt.getValue();
      }
    }
  }

done:
  token_.setNumericLiteral(val);
}
1644
1645
1.28M
/// Map the \p len characters at \p str to the TokenKind of the reserved word
/// they spell, using the RESWORD entries of TokenKinds.def.
/// \return the matching rw_* kind, or TokenKind::identifier if the text is
///   not a reserved word.
static TokenKind matchReservedWord(const char *str, unsigned len) {
  const llvh::StringRef word(str, len);
  // Each RESWORD(name) entry expands to one .Case("name", rw_name) link.
  return llvh::StringSwitch<TokenKind>(word)
#define RESWORD(name) .Case(#name, TokenKind::rw_##name)
#include "hermes/Parser/TokenKinds.def"
      .Default(TokenKind::identifier);
}
1651
1652
1.28M
/// Classify the \p length characters at \p start as a reserved word or a
/// plain identifier, taking strict mode into account.
/// \return the reserved word's kind, or TokenKind::identifier.
TokenKind JSLexer::scanReservedWord(const char *start, unsigned length) {
  const TokenKind rw = matchReservedWord(start, length);

  // In strict mode every reserved word is recognised as such.
  if (strictMode_)
    return rw;

  // "Future reserved words" should not be recognised in non-strict mode;
  // they are ordinary identifiers there.
  switch (rw) {
    case TokenKind::rw_implements:
    case TokenKind::rw_interface:
    case TokenKind::rw_package:
    case TokenKind::rw_private:
    case TokenKind::rw_protected:
    case TokenKind::rw_public:
    case TokenKind::rw_static:
    case TokenKind::rw_yield:
      return TokenKind::identifier;
    default:
      return rw;
  }
}
1674
1675
template <JSLexer::IdentifierMode Mode>
1676
1.28M
void JSLexer::scanIdentifierFastPath(const char *start) {
1677
1.28M
  const char *end = start;
1678
1679
  // Quickly consume the ASCII identifier part.
1680
1.28M
  char ch;
1681
1.28M
  do
1682
10.2M
    ch = (unsigned char)*++end;
1683
10.2M
  while (ch == '_' || ch == '$' || ((ch | 32) >= 'a' && (ch | 32) <= 'z') ||
1684
10.2M
         (ch >= '0' && ch <= '9') ||
1685
10.2M
         (Mode == IdentifierMode::JSX && ch == '-') ||
1686
10.2M
         (Mode == IdentifierMode::Flow && ch == '@'));
1687
1688
  // Check whether a slow part of the identifier follows.
1689
1.28M
  if (LLVM_UNLIKELY(ch == '\\')) {
1690
    // An escape. Pass the baton to the slow path.
1691
0
    initStorageWith(start, end);
1692
0
    curCharPtr_ = end;
1693
0
    scanIdentifierParts<Mode>();
1694
0
    return;
1695
1.28M
  } else if (LLVM_UNLIKELY(isUTF8Start(ch))) {
1696
    // If we have encountered a Unicode character, we try to decode it. If it
1697
    // can be a part of the identifier,
1698
    // we consume it, otherwise we leave it alone.
1699
2.18k
    auto decoded = _peekUTF8(end);
1700
2.18k
    if (isUnicodeIDContinue(decoded.first)) {
1701
2.13k
      initStorageWith(start, end);
1702
2.13k
      appendUnicodeToStorage(decoded.first);
1703
2.13k
      curCharPtr_ = decoded.second;
1704
2.13k
      scanIdentifierParts<Mode>();
1705
2.13k
      return;
1706
2.13k
    }
1707
2.18k
  }
1708
1709
1.28M
  curCharPtr_ = end;
1710
1711
1.28M
  size_t length = end - start;
1712
1713
1.28M
  auto rw = scanReservedWord(start, (unsigned)length);
1714
1.28M
  if (rw != TokenKind::identifier) {
1715
20.7k
    token_.setResWord(rw, resWordIdent(rw));
1716
1.26M
  } else {
1717
1.26M
    token_.setIdentifier(getIdentifier(llvh::StringRef(start, length)));
1718
1.26M
  }
1719
1.28M
}
Unexecuted instantiation: void hermes::parser::JSLexer::scanIdentifierFastPath<(hermes::parser::JSLexer::IdentifierMode)1>(char const*)
Unexecuted instantiation: void hermes::parser::JSLexer::scanIdentifierFastPath<(hermes::parser::JSLexer::IdentifierMode)2>(char const*)
void hermes::parser::JSLexer::scanIdentifierFastPath<(hermes::parser::JSLexer::IdentifierMode)0>(char const*)
Line
Count
Source
1676
1.28M
void JSLexer::scanIdentifierFastPath(const char *start) {
1677
1.28M
  const char *end = start;
1678
1679
  // Quickly consume the ASCII identifier part.
1680
1.28M
  char ch;
1681
1.28M
  do
1682
10.2M
    ch = (unsigned char)*++end;
1683
10.2M
  while (ch == '_' || ch == '$' || ((ch | 32) >= 'a' && (ch | 32) <= 'z') ||
1684
10.2M
         (ch >= '0' && ch <= '9') ||
1685
10.2M
         (Mode == IdentifierMode::JSX && ch == '-') ||
1686
10.2M
         (Mode == IdentifierMode::Flow && ch == '@'));
1687
1688
  // Check whether a slow part of the identifier follows.
1689
1.28M
  if (LLVM_UNLIKELY(ch == '\\')) {
1690
    // An escape. Pass the baton to the slow path.
1691
0
    initStorageWith(start, end);
1692
0
    curCharPtr_ = end;
1693
0
    scanIdentifierParts<Mode>();
1694
0
    return;
1695
1.28M
  } else if (LLVM_UNLIKELY(isUTF8Start(ch))) {
1696
    // If we have encountered a Unicode character, we try to decode it. If it
1697
    // can be a part of the identifier,
1698
    // we consume it, otherwise we leave it alone.
1699
2.18k
    auto decoded = _peekUTF8(end);
1700
2.18k
    if (isUnicodeIDContinue(decoded.first)) {
1701
2.13k
      initStorageWith(start, end);
1702
2.13k
      appendUnicodeToStorage(decoded.first);
1703
2.13k
      curCharPtr_ = decoded.second;
1704
2.13k
      scanIdentifierParts<Mode>();
1705
2.13k
      return;
1706
2.13k
    }
1707
2.18k
  }
1708
1709
1.28M
  curCharPtr_ = end;
1710
1711
1.28M
  size_t length = end - start;
1712
1713
1.28M
  auto rw = scanReservedWord(start, (unsigned)length);
1714
1.28M
  if (rw != TokenKind::identifier) {
1715
20.7k
    token_.setResWord(rw, resWordIdent(rw));
1716
1.26M
  } else {
1717
1.26M
    token_.setIdentifier(getIdentifier(llvh::StringRef(start, length)));
1718
1.26M
  }
1719
1.28M
}
1720
1721
template <JSLexer::IdentifierMode Mode>
1722
2.13k
void JSLexer::scanIdentifierParts() {
1723
2.13k
  consumeIdentifierParts<Mode>();
1724
2.13k
  auto rw =
1725
2.13k
      scanReservedWord(tmpStorage_.str().begin(), tmpStorage_.str().size());
1726
2.13k
  if (rw != TokenKind::identifier) {
1727
0
    token_.setResWord(rw, resWordIdent(rw));
1728
0
    sm_.warning(
1729
0
        {token_.getStartLoc(), SMLoc::getFromPointer(curCharPtr_)},
1730
0
        "scanning identifier with unicode escape as reserved word",
1731
0
        Subsystem::Lexer);
1732
2.13k
  } else {
1733
2.13k
    token_.setIdentifier(getIdentifier(tmpStorage_.str()));
1734
2.13k
  }
1735
2.13k
}
Unexecuted instantiation: void hermes::parser::JSLexer::scanIdentifierParts<(hermes::parser::JSLexer::IdentifierMode)1>()
Unexecuted instantiation: void hermes::parser::JSLexer::scanIdentifierParts<(hermes::parser::JSLexer::IdentifierMode)2>()
void hermes::parser::JSLexer::scanIdentifierParts<(hermes::parser::JSLexer::IdentifierMode)0>()
Line
Count
Source
1722
2.13k
void JSLexer::scanIdentifierParts() {
1723
2.13k
  consumeIdentifierParts<Mode>();
1724
2.13k
  auto rw =
1725
2.13k
      scanReservedWord(tmpStorage_.str().begin(), tmpStorage_.str().size());
1726
2.13k
  if (rw != TokenKind::identifier) {
1727
0
    token_.setResWord(rw, resWordIdent(rw));
1728
0
    sm_.warning(
1729
0
        {token_.getStartLoc(), SMLoc::getFromPointer(curCharPtr_)},
1730
0
        "scanning identifier with unicode escape as reserved word",
1731
0
        Subsystem::Lexer);
1732
2.13k
  } else {
1733
2.13k
    token_.setIdentifier(getIdentifier(tmpStorage_.str()));
1734
2.13k
  }
1735
2.13k
}
1736
1737
0
bool JSLexer::scanPrivateIdentifier() {
1738
0
  assert(*curCharPtr_ == '#');
1739
1740
  // Skip the '#'.
1741
0
  const char *start = curCharPtr_;
1742
0
  ++curCharPtr_;
1743
1744
  // Scan the actual identifier.
1745
0
  if (LLVM_LIKELY(isASCIIIdentifierStart(*curCharPtr_))) {
1746
0
    scanIdentifierFastPath<IdentifierMode::JS>(curCharPtr_);
1747
0
  } else if (consumeIdentifierStart()) {
1748
    // curCharPtr_ has been updated by consumeIdentifierStart.
1749
0
    scanIdentifierParts<IdentifierMode::JS>();
1750
0
  } else {
1751
0
    error(SMLoc::getFromPointer(start), "empty private identifier");
1752
0
    return false;
1753
0
  }
1754
1755
  // Parsed a resword or identifier.
1756
  // Convert the TokenKind to private_identifier after the fact.
1757
  // This avoids adding another Mode to IdentifierMode.
1758
0
  token_.setPrivateIdentifier(token_.getResWordOrIdentifier());
1759
1760
0
  return true;
1761
0
}
1762
1763
template <bool JSX>
1764
45
void JSLexer::scanString() {
1765
45
  assert(*curCharPtr_ == '\'' || *curCharPtr_ == '"');
1766
0
  char quoteCh = *curCharPtr_++;
1767
1768
  // Track whether we encounter any escapes or new line continuations. We need
1769
  // that information in order to detect directives.
1770
45
  bool escapes = false;
1771
1772
45
  tmpStorage_.clear();
1773
1774
6.06M
  for (;;) {
1775
6.06M
    if (*curCharPtr_ == quoteCh) {
1776
44
      ++curCharPtr_;
1777
44
      break;
1778
6.06M
    } else if (!JSX && *curCharPtr_ == '\\') {
1779
12
      escapes = true;
1780
12
      ++curCharPtr_;
1781
12
      switch ((unsigned char)*curCharPtr_) {
1782
0
        case '\'':
1783
0
        case '"':
1784
0
        case '\\':
1785
0
          tmpStorage_.push_back((unsigned char)*curCharPtr_++);
1786
0
          break;
1787
1788
0
        case 'b':
1789
0
          ++curCharPtr_;
1790
0
          tmpStorage_.push_back(8);
1791
0
          break;
1792
0
        case 'f':
1793
0
          ++curCharPtr_;
1794
0
          tmpStorage_.push_back(12);
1795
0
          break;
1796
0
        case 'n':
1797
0
          ++curCharPtr_;
1798
0
          tmpStorage_.push_back(10);
1799
0
          break;
1800
0
        case 'r':
1801
0
          ++curCharPtr_;
1802
0
          tmpStorage_.push_back(13);
1803
0
          break;
1804
0
        case 't':
1805
0
          ++curCharPtr_;
1806
0
          tmpStorage_.push_back(9);
1807
0
          break;
1808
2
        case 'v':
1809
2
          ++curCharPtr_;
1810
2
          tmpStorage_.push_back(11);
1811
2
          break;
1812
1813
0
        case '\0': // EOF?
1814
0
          if (curCharPtr_ == bufferEnd_) { // eof?
1815
0
            error(SMLoc::getFromPointer(curCharPtr_), "non-terminated string");
1816
0
            sm_.note(token_.getStartLoc(), "string started here");
1817
0
            goto breakLoop;
1818
0
          } else {
1819
0
            tmpStorage_.push_back((unsigned char)*curCharPtr_++);
1820
0
          }
1821
0
          break;
1822
1823
0
        case '0':
1824
          // '\0' is not an octal so handle it separately.
1825
0
          if (!(curCharPtr_[1] >= '0' && curCharPtr_[1] <= '7')) {
1826
0
            ++curCharPtr_;
1827
0
            appendUnicodeToStorage(0);
1828
0
            break;
1829
0
          }
1830
0
          [[fallthrough]];
1831
0
        case '1':
1832
0
        case '2':
1833
0
        case '3':
1834
0
          appendUnicodeToStorage(consumeOctal(3));
1835
0
          break;
1836
0
        case '4':
1837
0
        case '5':
1838
0
        case '6':
1839
2
        case '7':
1840
2
          appendUnicodeToStorage(consumeOctal(2));
1841
2
          break;
1842
1843
2
        case 'x': {
1844
2
          ++curCharPtr_;
1845
2
          auto v = consumeHex(2);
1846
2
          appendUnicodeToStorage(v ? *v : 0);
1847
2
          break;
1848
0
        }
1849
1850
0
        case 'u':
1851
0
          --curCharPtr_;
1852
0
          appendUnicodeToStorage(consumeUnicodeEscape());
1853
0
          break;
1854
1855
        // Escaped line terminator. We just need to skip it.
1856
0
        case '\n':
1857
0
          ++curCharPtr_;
1858
0
          break;
1859
0
        case '\r':
1860
0
          ++curCharPtr_;
1861
0
          if (*curCharPtr_ == '\n') // skip CR LF
1862
0
            ++curCharPtr_;
1863
0
          break;
1864
0
        case UTF8_LINE_TERMINATOR_CHAR0:
1865
0
          if (matchUnicodeLineTerminatorOffset1(curCharPtr_)) {
1866
0
            curCharPtr_ += 3;
1867
0
            break;
1868
0
          }
1869
0
          appendUnicodeToStorage(_decodeUTF8SlowPath(curCharPtr_));
1870
0
          break;
1871
1872
6
        default:
1873
6
          if (LLVM_UNLIKELY(isUTF8Start(*curCharPtr_)))
1874
0
            appendUnicodeToStorage(_decodeUTF8SlowPath(curCharPtr_));
1875
6
          else
1876
6
            tmpStorage_.push_back((unsigned char)*curCharPtr_++);
1877
6
          break;
1878
12
      }
1879
6.06M
    } else if (LLVM_UNLIKELY(*curCharPtr_ == '\n' || *curCharPtr_ == '\r')) {
1880
0
      if (JSX) {
1881
0
        tmpStorage_.push_back(*curCharPtr_++);
1882
0
      } else {
1883
0
        error(SMLoc::getFromPointer(curCharPtr_), "non-terminated string");
1884
0
        sm_.note(token_.getStartLoc(), "string started here");
1885
0
        break;
1886
0
      }
1887
0
#if HERMES_PARSE_JSX
1888
6.06M
    } else if (LLVM_UNLIKELY(JSX && *curCharPtr_ == '&')) {
1889
0
      auto codePoint = consumeHTMLEntityOptional();
1890
0
      if (codePoint.hasValue()) {
1891
0
        appendUnicodeToStorage(*codePoint);
1892
0
      } else {
1893
0
        tmpStorage_.push_back(*curCharPtr_++);
1894
0
      }
1895
0
#endif
1896
6.06M
    } else if (LLVM_UNLIKELY(*curCharPtr_ == 0 && curCharPtr_ == bufferEnd_)) {
1897
1
      error(SMLoc::getFromPointer(curCharPtr_), "non-terminated string");
1898
1
      sm_.note(token_.getStartLoc(), "string started here");
1899
1
      break;
1900
6.06M
    } else {
1901
6.06M
      if (LLVM_UNLIKELY(isUTF8Start(*curCharPtr_))) {
1902
        // Decode and re-encode the character and append it to the string
1903
        // storage
1904
0
        appendUnicodeToStorage(_decodeUTF8SlowPath(curCharPtr_));
1905
6.06M
      } else {
1906
6.06M
        tmpStorage_.push_back(*curCharPtr_++);
1907
6.06M
      }
1908
6.06M
    }
1909
6.06M
  }
1910
45
breakLoop:
1911
45
  token_.setStringLiteral(getStringLiteral(tmpStorage_.str()), escapes);
1912
45
}
Unexecuted instantiation: void hermes::parser::JSLexer::scanString<true>()
void hermes::parser::JSLexer::scanString<false>()
Line
Count
Source
1764
45
void JSLexer::scanString() {
1765
45
  assert(*curCharPtr_ == '\'' || *curCharPtr_ == '"');
1766
0
  char quoteCh = *curCharPtr_++;
1767
1768
  // Track whether we encounter any escapes or new line continuations. We need
1769
  // that information in order to detect directives.
1770
45
  bool escapes = false;
1771
1772
45
  tmpStorage_.clear();
1773
1774
6.06M
  for (;;) {
1775
6.06M
    if (*curCharPtr_ == quoteCh) {
1776
44
      ++curCharPtr_;
1777
44
      break;
1778
6.06M
    } else if (!JSX && *curCharPtr_ == '\\') {
1779
12
      escapes = true;
1780
12
      ++curCharPtr_;
1781
12
      switch ((unsigned char)*curCharPtr_) {
1782
0
        case '\'':
1783
0
        case '"':
1784
0
        case '\\':
1785
0
          tmpStorage_.push_back((unsigned char)*curCharPtr_++);
1786
0
          break;
1787
1788
0
        case 'b':
1789
0
          ++curCharPtr_;
1790
0
          tmpStorage_.push_back(8);
1791
0
          break;
1792
0
        case 'f':
1793
0
          ++curCharPtr_;
1794
0
          tmpStorage_.push_back(12);
1795
0
          break;
1796
0
        case 'n':
1797
0
          ++curCharPtr_;
1798
0
          tmpStorage_.push_back(10);
1799
0
          break;
1800
0
        case 'r':
1801
0
          ++curCharPtr_;
1802
0
          tmpStorage_.push_back(13);
1803
0
          break;
1804
0
        case 't':
1805
0
          ++curCharPtr_;
1806
0
          tmpStorage_.push_back(9);
1807
0
          break;
1808
2
        case 'v':
1809
2
          ++curCharPtr_;
1810
2
          tmpStorage_.push_back(11);
1811
2
          break;
1812
1813
0
        case '\0': // EOF?
1814
0
          if (curCharPtr_ == bufferEnd_) { // eof?
1815
0
            error(SMLoc::getFromPointer(curCharPtr_), "non-terminated string");
1816
0
            sm_.note(token_.getStartLoc(), "string started here");
1817
0
            goto breakLoop;
1818
0
          } else {
1819
0
            tmpStorage_.push_back((unsigned char)*curCharPtr_++);
1820
0
          }
1821
0
          break;
1822
1823
0
        case '0':
1824
          // '\0' is not an octal so handle it separately.
1825
0
          if (!(curCharPtr_[1] >= '0' && curCharPtr_[1] <= '7')) {
1826
0
            ++curCharPtr_;
1827
0
            appendUnicodeToStorage(0);
1828
0
            break;
1829
0
          }
1830
0
          [[fallthrough]];
1831
0
        case '1':
1832
0
        case '2':
1833
0
        case '3':
1834
0
          appendUnicodeToStorage(consumeOctal(3));
1835
0
          break;
1836
0
        case '4':
1837
0
        case '5':
1838
0
        case '6':
1839
2
        case '7':
1840
2
          appendUnicodeToStorage(consumeOctal(2));
1841
2
          break;
1842
1843
2
        case 'x': {
1844
2
          ++curCharPtr_;
1845
2
          auto v = consumeHex(2);
1846
2
          appendUnicodeToStorage(v ? *v : 0);
1847
2
          break;
1848
0
        }
1849
1850
0
        case 'u':
1851
0
          --curCharPtr_;
1852
0
          appendUnicodeToStorage(consumeUnicodeEscape());
1853
0
          break;
1854
1855
        // Escaped line terminator. We just need to skip it.
1856
0
        case '\n':
1857
0
          ++curCharPtr_;
1858
0
          break;
1859
0
        case '\r':
1860
0
          ++curCharPtr_;
1861
0
          if (*curCharPtr_ == '\n') // skip CR LF
1862
0
            ++curCharPtr_;
1863
0
          break;
1864
0
        case UTF8_LINE_TERMINATOR_CHAR0:
1865
0
          if (matchUnicodeLineTerminatorOffset1(curCharPtr_)) {
1866
0
            curCharPtr_ += 3;
1867
0
            break;
1868
0
          }
1869
0
          appendUnicodeToStorage(_decodeUTF8SlowPath(curCharPtr_));
1870
0
          break;
1871
1872
6
        default:
1873
6
          if (LLVM_UNLIKELY(isUTF8Start(*curCharPtr_)))
1874
0
            appendUnicodeToStorage(_decodeUTF8SlowPath(curCharPtr_));
1875
6
          else
1876
6
            tmpStorage_.push_back((unsigned char)*curCharPtr_++);
1877
6
          break;
1878
12
      }
1879
6.06M
    } else if (LLVM_UNLIKELY(*curCharPtr_ == '\n' || *curCharPtr_ == '\r')) {
1880
0
      if (JSX) {
1881
0
        tmpStorage_.push_back(*curCharPtr_++);
1882
0
      } else {
1883
0
        error(SMLoc::getFromPointer(curCharPtr_), "non-terminated string");
1884
0
        sm_.note(token_.getStartLoc(), "string started here");
1885
0
        break;
1886
0
      }
1887
0
#if HERMES_PARSE_JSX
1888
6.06M
    } else if (LLVM_UNLIKELY(JSX && *curCharPtr_ == '&')) {
1889
0
      auto codePoint = consumeHTMLEntityOptional();
1890
0
      if (codePoint.hasValue()) {
1891
0
        appendUnicodeToStorage(*codePoint);
1892
0
      } else {
1893
0
        tmpStorage_.push_back(*curCharPtr_++);
1894
0
      }
1895
0
#endif
1896
6.06M
    } else if (LLVM_UNLIKELY(*curCharPtr_ == 0 && curCharPtr_ == bufferEnd_)) {
1897
1
      error(SMLoc::getFromPointer(curCharPtr_), "non-terminated string");
1898
1
      sm_.note(token_.getStartLoc(), "string started here");
1899
1
      break;
1900
6.06M
    } else {
1901
6.06M
      if (LLVM_UNLIKELY(isUTF8Start(*curCharPtr_))) {
1902
        // Decode and re-encode the character and append it to the string
1903
        // storage
1904
0
        appendUnicodeToStorage(_decodeUTF8SlowPath(curCharPtr_));
1905
6.06M
      } else {
1906
6.06M
        tmpStorage_.push_back(*curCharPtr_++);
1907
6.06M
      }
1908
6.06M
    }
1909
6.06M
  }
1910
45
breakLoop:
1911
45
  token_.setStringLiteral(getStringLiteral(tmpStorage_.str()), escapes);
1912
45
}
1913
1914
701k
void JSLexer::scanTemplateLiteral() {
1915
701k
  assert(*curCharPtr_ == '`' || *curCharPtr_ == '}');
1916
1917
  // Whether the token will result in TemplateHead upon encountering ${.
1918
  // If we end the literal with `, then the result is NoSubstitutionTemplate,
1919
  // so this will be ignored.
1920
0
  bool isHead = *curCharPtr_ == '`';
1921
1922
  // If the token ended with a ` then it's a tail (or NoSubstitutionTemplate),
1923
  // and if it ended with a ${ then it's not a tail.
1924
701k
  bool isTail = false;
1925
1926
  // Advance past the initial `.
1927
701k
  ++curCharPtr_;
1928
1929
  // Track whether we encounter any NotEscapeSequence instances,
1930
  // which will be used to error out on non-tagged sequences.
1931
701k
  bool foundNotEscapeSequence = false;
1932
1933
  // Store the Template Value (TV) in the tmpStorage_.
1934
701k
  tmpStorage_.clear();
1935
1936
  // Store the Template Raw Value (TRV) in the rawStorage_.
1937
701k
  rawStorage_.clear();
1938
1939
  /// Return the Template Raw Value (TRV) of character \p c.
1940
  /// The only time the TRV is different from c is when c is a <CR>.
1941
  /// In that case, this function will return 0x0a (LINE FEED).
1942
701k
  const auto trv = [](char c) -> char {
1943
1.17k
    if (c == '\r') {
1944
      // This case takes \r and \r\n into account.
1945
      // The code below which consumes line separators will skip the following
1946
      // \n if there is a \r\n.
1947
      // For the purposes of finding the TRV it doesn't matter.
1948
116
      return 0x0a;
1949
116
    }
1950
1.05k
    return c;
1951
1.17k
  };
1952
1953
4.20M
  for (;;) {
1954
4.20M
    if (*curCharPtr_ == '`') {
1955
526k
      isTail = true;
1956
526k
      ++curCharPtr_;
1957
526k
      break;
1958
3.67M
    } else if (*curCharPtr_ == '$' && curCharPtr_[1] == '{') {
1959
      // End of the TemplateCharacters.
1960
174k
      isTail = false;
1961
174k
      curCharPtr_ += 2;
1962
174k
      break;
1963
3.50M
    } else if (*curCharPtr_ == '\\') {
1964
1.05k
      rawStorage_.push_back(*curCharPtr_);
1965
1.05k
      ++curCharPtr_;
1966
1.05k
      rawStorage_.push_back(trv(*curCharPtr_));
1967
1.05k
      switch ((unsigned char)*curCharPtr_) {
1968
0
        case '\'':
1969
0
        case '"':
1970
602
        case '\\':
1971
602
          tmpStorage_.push_back((unsigned char)*curCharPtr_++);
1972
602
          break;
1973
1974
0
        case 'b':
1975
0
          ++curCharPtr_;
1976
0
          tmpStorage_.push_back(8);
1977
0
          break;
1978
0
        case 'f':
1979
0
          ++curCharPtr_;
1980
0
          tmpStorage_.push_back(12);
1981
0
          break;
1982
0
        case 'n':
1983
0
          ++curCharPtr_;
1984
0
          tmpStorage_.push_back(10);
1985
0
          break;
1986
2
        case 'r':
1987
2
          ++curCharPtr_;
1988
2
          tmpStorage_.push_back(13);
1989
2
          break;
1990
0
        case 't':
1991
0
          ++curCharPtr_;
1992
0
          tmpStorage_.push_back(9);
1993
0
          break;
1994
0
        case 'v':
1995
0
          ++curCharPtr_;
1996
0
          tmpStorage_.push_back(11);
1997
0
          break;
1998
1999
0
        case '\0': // EOF?
2000
0
          if (curCharPtr_ == bufferEnd_) { // eof?
2001
0
            error(
2002
0
                SMLoc::getFromPointer(curCharPtr_),
2003
0
                "non-terminated template literal");
2004
0
            sm_.note(token_.getStartLoc(), "template literal started here");
2005
0
            goto breakLoop;
2006
0
          } else {
2007
0
            tmpStorage_.push_back((unsigned char)*curCharPtr_++);
2008
0
          }
2009
0
          break;
2010
2011
2
        case '0':
2012
          // '\0' is only a valid escape sequence if not followed by a
2013
          // DecimalDigit.
2014
2
          if (!(curCharPtr_[1] >= '0' && curCharPtr_[1] <= '9')) {
2015
0
            ++curCharPtr_;
2016
0
            appendUnicodeToStorage(0);
2017
0
            break;
2018
0
          }
2019
2
          [[fallthrough]];
2020
2021
4
        case '1':
2022
4
        case '2':
2023
4
        case '3':
2024
4
        case '4':
2025
4
        case '5':
2026
4
        case '6':
2027
4
        case '7':
2028
6
        case '8':
2029
6
        case '9':
2030
          // NotEscapeSequence :: DecimalDigit but not 0
2031
          // NotEscapeSequence :: 0 DecimalDigit
2032
          // Octal numbers are not supported in template strings,
2033
          // so leave the number in the raw storage (done above) and move on.
2034
6
          ++curCharPtr_;
2035
6
          foundNotEscapeSequence = true;
2036
6
          break;
2037
2038
0
        case 'x': {
2039
0
          ++curCharPtr_;
2040
0
          const char *start = curCharPtr_;
2041
0
          auto v = consumeHex(2, false);
2042
0
          if (!v) {
2043
0
            foundNotEscapeSequence = true;
2044
0
          }
2045
0
          appendUnicodeToStorage(v ? *v : 0);
2046
0
          rawStorage_.append({start, (size_t)(curCharPtr_ - start)});
2047
0
          break;
2048
6
        }
2049
2050
192
        case 'u': {
2051
          // Pointer to the first character after the 'u', which is where we
2052
          // can continue scanning from if we fail to decode an escape.
2053
192
          const char *start = curCharPtr_ + 1;
2054
          // Reset the pointer to the '\' to scan the unicode escape.
2055
192
          --curCharPtr_;
2056
192
          assert(*curCharPtr_ == '\\' && "must have started with \\");
2057
0
          auto codepoint = consumeUnicodeEscapeOptional();
2058
192
          if (!codepoint) {
2059
92
            foundNotEscapeSequence = true;
2060
92
            curCharPtr_ = start;
2061
92
            break;
2062
92
          }
2063
100
          appendUnicodeToStorage(*codepoint);
2064
100
          rawStorage_.append({start, (size_t)(curCharPtr_ - start)});
2065
100
          break;
2066
192
        }
2067
2068
        // Escaped line terminator. We just need to skip it, because it was
2069
        // added to the raw storage at the start of the switch statement.
2070
2
        case '\n':
2071
2
          ++curCharPtr_;
2072
2
          break;
2073
0
        case '\r':
2074
0
          ++curCharPtr_;
2075
0
          if (*curCharPtr_ == '\n') // skip CR LF
2076
0
            ++curCharPtr_;
2077
0
          break;
2078
0
        case UTF8_LINE_TERMINATOR_CHAR0: {
2079
0
          bool isLineTerminator =
2080
0
              matchUnicodeLineTerminatorOffset1(curCharPtr_);
2081
0
          uint32_t codepoint = _decodeUTF8SlowPath(curCharPtr_);
2082
          // Needs to be added to the rawStorage_ regardless,
2083
          // but we first need to pop off the byte that was added prior to the
2084
          // switch statement.
2085
0
          rawStorage_.pop_back();
2086
0
          appendUnicodeToStorage(codepoint, rawStorage_);
2087
0
          if (!isLineTerminator) {
2088
            // Only add the codepoint to the tmpStorage if it wasn't a line
2089
            // terminator.
2090
0
            appendUnicodeToStorage(codepoint);
2091
0
          }
2092
0
          break;
2093
192
        }
2094
2095
251
        default:
2096
251
          if (LLVM_UNLIKELY(isUTF8Start(*curCharPtr_))) {
2097
0
            uint32_t codepoint = _decodeUTF8SlowPath(curCharPtr_);
2098
0
            appendUnicodeToStorage(codepoint);
2099
            // Remove the last byte from rawStorage_ and then append the
2100
            // unicode codepoint to it. The already inserted byte will change
2101
            // if this codepoint is in Supplementary Planes.
2102
0
            rawStorage_.pop_back();
2103
0
            appendUnicodeToStorage(codepoint, rawStorage_);
2104
251
          } else {
2105
            // The TV of EscapeSequence is the SV of EscapeSequence.
2106
251
            tmpStorage_.push_back((unsigned char)*curCharPtr_++);
2107
251
          }
2108
251
          break;
2109
1.05k
      }
2110
3.50M
    } else if (LLVM_UNLIKELY(*curCharPtr_ == 0 && curCharPtr_ == bufferEnd_)) {
2111
5
      error(
2112
5
          SMLoc::getFromPointer(curCharPtr_),
2113
5
          "non-terminated template literal");
2114
5
      sm_.note(token_.getStartLoc(), "template literal started here");
2115
5
      break;
2116
3.50M
    } else if (*curCharPtr_ == '\r') {
2117
      // The TV of LineTerminatorSequence is the TRV of
2118
      // LineTerminatorSequence. The only time this differs from the same
2119
      // characters as the bytes in the file is when the sequence begins with
2120
      // a <CR>.
2121
58
      tmpStorage_.push_back(trv(*curCharPtr_));
2122
58
      rawStorage_.push_back(trv(*curCharPtr_));
2123
58
      curCharPtr_++;
2124
58
      if (*curCharPtr_ == '\n') {
2125
        // Skip the <CR> <LF>
2126
0
        curCharPtr_++;
2127
0
      }
2128
3.50M
    } else {
2129
3.50M
      if (LLVM_UNLIKELY(isUTF8Start(*curCharPtr_))) {
2130
        // Decode and re-encode the character and append it to the string
2131
        // storage
2132
3.87k
        uint32_t codepoint = _decodeUTF8SlowPath(curCharPtr_);
2133
3.87k
        appendUnicodeToStorage(codepoint);
2134
3.87k
        appendUnicodeToStorage(codepoint, rawStorage_);
2135
3.50M
      } else {
2136
3.50M
        rawStorage_.push_back(*curCharPtr_);
2137
3.50M
        tmpStorage_.push_back(*curCharPtr_++);
2138
3.50M
      }
2139
3.50M
    }
2140
4.20M
  }
2141
701k
breakLoop:
2142
  // If the template literal is tagged and contains invalid escapes, then
2143
  // cooked should be null because there is no way to cook it, per the ESTree
2144
  // 2018 spec. The parser will error when encountering an untagged literal
2145
  // with invalid escapes, so we place nullptr here.
2146
701k
  UniqueString *cookedStr =
2147
701k
      foundNotEscapeSequence ? nullptr : getStringLiteral(tmpStorage_.str());
2148
701k
  UniqueString *rawStr = getStringLiteral(rawStorage_.str());
2149
701k
  if (isHead) {
2150
526k
    if (isTail) {
2151
      // ` characters `
2152
526k
      token_.setTemplateLiteral(
2153
526k
          TokenKind::no_substitution_template, cookedStr, rawStr);
2154
526k
    } else {
2155
      // ` characters ${
2156
320
      token_.setTemplateLiteral(TokenKind::template_head, cookedStr, rawStr);
2157
320
    }
2158
526k
  } else {
2159
174k
    if (isTail) {
2160
      // } characters `
2161
314
      token_.setTemplateLiteral(TokenKind::template_tail, cookedStr, rawStr);
2162
174k
    } else {
2163
      // } characters ${
2164
174k
      token_.setTemplateLiteral(TokenKind::template_middle, cookedStr, rawStr);
2165
174k
    }
2166
174k
  }
2167
701k
}
2168
2169
/// TODO: this has to be implemented properly.
2170
61
void JSLexer::scanRegExp() {
2171
61
  SMLoc startLoc = SMLoc::getFromPointer(curCharPtr_);
2172
61
  assert(*curCharPtr_ == '/');
2173
0
  ++curCharPtr_;
2174
2175
61
  tmpStorage_.clear();
2176
61
  bool inClass = false;
2177
2178
405k
  for (;;) {
2179
405k
    switch ((unsigned char)*curCharPtr_) {
2180
61
      case '/':
2181
61
        if (!inClass) {
2182
61
          ++curCharPtr_;
2183
61
          goto exitLoop;
2184
61
        }
2185
0
        break;
2186
2187
118
      case '[':
2188
118
        inClass = true; // It may be true already, but so what.
2189
118
        break;
2190
2191
118
      case ']':
2192
118
        inClass = false; // It may be false already, but so what.
2193
118
        break;
2194
2195
106
      case '\\': // an escape
2196
106
        tmpStorage_.push_back((unsigned char)*curCharPtr_);
2197
106
        ++curCharPtr_;
2198
106
        switch ((unsigned char)*curCharPtr_) {
2199
0
          case '\0':
2200
0
            if (curCharPtr_ == bufferEnd_)
2201
0
              goto unterminated;
2202
0
            break;
2203
0
          case UTF8_LINE_TERMINATOR_CHAR0:
2204
0
            if (matchUnicodeLineTerminatorOffset1(curCharPtr_))
2205
0
              goto unterminated;
2206
0
            break;
2207
0
          case '\n':
2208
0
          case '\r':
2209
0
            goto unterminated;
2210
106
        }
2211
106
        break;
2212
2213
106
      case '\0':
2214
13
        if (curCharPtr_ == bufferEnd_)
2215
0
          goto unterminated;
2216
13
        break;
2217
13
      case UTF8_LINE_TERMINATOR_CHAR0:
2218
0
        if (matchUnicodeLineTerminatorOffset1(curCharPtr_))
2219
0
          goto unterminated;
2220
0
        break;
2221
2222
0
      case '\n':
2223
0
      case '\r':
2224
0
      unterminated:
2225
0
        error(
2226
0
            SMLoc::getFromPointer(curCharPtr_),
2227
0
            "non-terminated regular expression literal");
2228
0
        sm_.note(startLoc, "regular expression started here");
2229
0
        goto exitLoop;
2230
405k
    }
2231
2232
405k
    if (LLVM_UNLIKELY(isUTF8Start((unsigned char)*curCharPtr_)))
2233
4
      appendUnicodeToStorage(_decodeUTF8SlowPath(curCharPtr_));
2234
405k
    else
2235
405k
      tmpStorage_.push_back((unsigned char)*curCharPtr_++);
2236
405k
  }
2237
61
exitLoop:
2238
61
  UniqueString *body = getStringLiteral(tmpStorage_.str());
2239
2240
  // Scan the flags. We must not interpret escape sequences.
2241
  // E6 5.1 7.8.5: "The Strings of characters comprising the
2242
  // RegularExpressionBody and the RegularExpressionFlags are passed
2243
  // uninterpreted to the regular expression constructor"
2244
61
  tmpStorage_.clear();
2245
61
  bool escapingBackslash = false;
2246
255k
  for (;;) {
2247
255k
    if (consumeOneIdentifierPartNoEscape<IdentifierMode::JS>()) {
2248
255k
      escapingBackslash = false;
2249
255k
      continue;
2250
255k
    } else if (*curCharPtr_ == '\\') {
2251
0
      tmpStorage_.push_back(*curCharPtr_++);
2252
2253
      // ES6 11.8.5.1: It is a Syntax Error if IdentifierPart contains a
2254
      // Unicode escape sequence.
2255
0
      escapingBackslash = !escapingBackslash;
2256
0
      if (escapingBackslash && *curCharPtr_ == 'u') {
2257
0
        error(
2258
0
            SMLoc::getFromPointer(curCharPtr_),
2259
0
            "Unicode escape sequences are not allowed in regular expression flags");
2260
0
      }
2261
61
    } else {
2262
61
      break;
2263
61
    }
2264
255k
  }
2265
2266
61
  UniqueString *flags = getStringLiteral(tmpStorage_.str());
2267
2268
61
  token_.setRegExpLiteral(new (allocator_.Allocate<RegExpLiteral>(1))
2269
61
                              RegExpLiteral(body, flags));
2270
61
}
2271
2272
0
/// Run \p str through convertUTF8WithSurrogatesToUTF8WithReplacements() and
/// look the result up in the string table.
/// \return the string table entry for the converted text.
UniqueString *JSLexer::convertSurrogatesInString(llvh::StringRef str) {
  std::string converted;
  convertUTF8WithSurrogatesToUTF8WithReplacements(converted, str);

  UniqueString *const uniqued = strTab_.getString(converted);
  return uniqued;
}
2277
2278
179k
/// Report the lexer error \p msg at location \p loc.
/// \return true if lexing may continue; false if the error limit has been
///   reached, in which case forceEOF() has been called.
bool JSLexer::error(llvh::SMLoc loc, const llvh::Twine &msg) {
  sm_.error(loc, msg, Subsystem::Lexer);

  if (sm_.isErrorLimitReached()) {
    forceEOF();
    return false;
  }
  return true;
}
2285
2286
26
/// Report the lexer error \p msg for the source range \p range.
/// \return true if lexing may continue; false if the error limit has been
///   reached, in which case forceEOF() has been called.
bool JSLexer::error(llvh::SMRange range, const llvh::Twine &msg) {
  sm_.error(range, msg, Subsystem::Lexer);

  if (sm_.isErrorLimitReached()) {
    forceEOF();
    return false;
  }
  return true;
}
2293
2294
bool JSLexer::error(
2295
    llvh::SMLoc loc,
2296
    llvh::SMRange range,
2297
0
    const llvh::Twine &msg) {
2298
0
  sm_.error(loc, range, msg, Subsystem::Lexer);
2299
0
  if (!sm_.isErrorLimitReached())
2300
0
    return true;
2301
0
  forceEOF();
2302
0
  return false;
2303
0
}
2304
2305
} // namespace parser
2306
} // namespace hermes