Coverage Report

Created: 2025-08-28 06:17

/src/wabt/src/wast-lexer.cc
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright 2016 WebAssembly Community Group participants
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include "wabt/wast-lexer.h"
18
19
#include <cassert>
20
#include <cstdio>
21
22
#include "wabt/config.h"
23
24
#include "wabt/lexer-source.h"
25
26
25.6M
#define ERROR(...) Error(GetLocation(), __VA_ARGS__)
27
28
namespace wabt {
29
30
namespace {
31
32
#if __clang__
33
#pragma clang diagnostic push
34
#pragma clang diagnostic ignored "-Wimplicit-fallthrough"
35
#endif
36
#include "prebuilt/lexer-keywords.cc"
37
#if __clang__
38
#pragma clang diagnostic pop
39
#endif
40
41
}  // namespace
42
43
// Constructs a lexer that takes ownership of `source` and scans the bytes it
// exposes through data()/size(). `filename` is stored for use in token
// locations; diagnostics are appended to `errors`.
WastLexer::WastLexer(std::unique_ptr<LexerSource> source,
                     std::string_view filename,
                     Errors* errors)
    : source_(std::move(source)),
      filename_(filename),
      // Line numbers reported in locations are 1-based.
      line_(1),
      // [buffer_, buffer_end_) is the half-open byte range being lexed.
      buffer_(static_cast<const char*>(source_->data())),
      buffer_end_(buffer_ + source_->size()),
      // Every scanning position starts at the first byte of the buffer.
      line_start_(buffer_),
      token_start_(buffer_),
      cursor_(buffer_),
      errors_(errors) {}
55
56
// static
57
std::unique_ptr<WastLexer> WastLexer::CreateBufferLexer(
58
    std::string_view filename,
59
    const void* data,
60
    size_t size,
61
23.5k
    Errors* errors) {
62
23.5k
  return std::make_unique<WastLexer>(std::make_unique<LexerSource>(data, size),
63
23.5k
                                     filename, errors);
64
23.5k
}
65
66
24.3M
// Scans and returns the next token.
//
// Whitespace and comments are skipped. A character that cannot start any
// token is consumed, reported as "unexpected char", and scanning resumes with
// the following character. An Eof token is returned at end of input, and also
// when a block comment or line comment runs into end of input.
Token WastLexer::GetToken() {
  for (;;) {
    // Each attempt starts a fresh token at the current position.
    token_start_ = cursor_;
    const int next = PeekChar();

    switch (next) {
      case kEof:
        return BareToken(TokenType::Eof);

      case '(':
        if (MatchString("(;")) {
          if (!ReadBlockComment()) {
            return BareToken(TokenType::Eof);
          }
          continue;
        }
        if (MatchString("(@")) {
          GetIdChars();
          // offset=2 to skip the "(@" prefix
          return TextToken(TokenType::LparAnn, 2);
        }
        ReadChar();
        return BareToken(TokenType::Lpar);

      case ')':
        ReadChar();
        return BareToken(TokenType::Rpar);

      case ';':
        if (MatchString(";;")) {
          if (!ReadLineComment()) {
            return BareToken(TokenType::Eof);
          }
          continue;
        }
        // A lone ';' does not begin a comment.
        ReadChar();
        ERROR("unexpected char");
        continue;

      case ' ':
      case '\t':
      case '\r':
      case '\n':
        ReadWhitespace();
        continue;

      case '"':
        return GetStringToken();

      case '+':
      case '-':
        // Consume the sign, then dispatch on what follows it.
        ReadChar();
        switch (PeekChar()) {
          case 'i':
            return GetInfToken();

          case 'n':
            return GetNanToken();

          case '0':
            return MatchString("0x") ? GetHexNumberToken(TokenType::Int)
                                     : GetNumberToken(TokenType::Int);

          case '1': case '2': case '3':
          case '4': case '5': case '6':
          case '7': case '8': case '9':
            return GetNumberToken(TokenType::Int);

          default:
            return GetReservedToken();
        }

      case '0':
        return MatchString("0x") ? GetHexNumberToken(TokenType::Nat)
                                 : GetNumberToken(TokenType::Nat);

      case '1': case '2': case '3':
      case '4': case '5': case '6':
      case '7': case '8': case '9':
        return GetNumberToken(TokenType::Nat);

      case '$':
        return GetIdChars();  // Initial $ is idchar, so this produces id token

      case 'a':
        return GetNameEqNumToken("align=", TokenType::AlignEqNat);

      case 'i':
        return GetInfToken();

      case 'n':
        return GetNanToken();

      case 'o':
        return GetNameEqNumToken("offset=", TokenType::OffsetEqNat);

      default:
        if (IsKeyword(next)) {
          return GetKeywordToken();
        }
        if (IsIdChar(next)) {
          return GetReservedToken();
        }
        ReadChar();
        ERROR("unexpected char");
        continue;
    }
  }
}
188
189
52.5M
// Builds the source location of the current token: the stored filename and
// line number plus the columns of token_start_ and cursor_.
Location WastLexer::GetLocation() {
  // Columns are 1-based offsets from line_start_. A pointer that lies before
  // line_start_ is clamped to column 1.
  const auto column_of = [this](const char* pos) -> int {
    const int one_based = static_cast<int>(pos - line_start_ + 1);
    return one_based < 1 ? 1 : one_based;
  };

  const int first_column = column_of(token_start_);
  const int last_column = column_of(cursor_);
  return Location(filename_, line_, first_column, last_column);
}
195
196
9.19M
std::string_view WastLexer::GetText(size_t offset) {
197
  // Bounds checks are necessary because token_start may have been moved
198
  // (e.g. if GetStringToken found a newline and reset token_start to
199
  // point at it).
200
201
9.19M
  if (token_start_ + offset >= buffer_end_)
202
104
    return {};
203
204
9.19M
  if (cursor_ <= token_start_ + offset)
205
1.25M
    return {};
206
207
7.93M
  return std::string_view(token_start_ + offset,
208
7.93M
                          (cursor_ - token_start_) - offset);
209
9.19M
}
210
211
12.2M
// Creates a token of `token_type` that carries only its location.
Token WastLexer::BareToken(TokenType token_type) {
  const Location where = GetLocation();
  return Token(where, token_type);
}
214
215
1.40M
// Creates a token of `token_type` whose payload is a Literal of kind
// `literal_type` built from the current token's text.
Token WastLexer::LiteralToken(TokenType token_type, LiteralType literal_type) {
  const Location where = GetLocation();
  return Token(where, token_type, Literal(literal_type, GetText()));
}
218
219
7.79M
// Creates a token of `token_type` whose payload is the current token's text
// with the first `offset` bytes skipped.
Token WastLexer::TextToken(TokenType token_type, size_t offset) {
  const Location where = GetLocation();
  return Token(where, token_type, GetText(offset));
}
222
223
373M
int WastLexer::PeekChar() {
224
373M
  return cursor_ < buffer_end_ ? static_cast<uint8_t>(*cursor_) : kEof;
225
373M
}
226
227
338M
int WastLexer::ReadChar() {
228
338M
  return cursor_ < buffer_end_ ? static_cast<uint8_t>(*cursor_++) : kEof;
229
338M
}
230
231
22.8M
bool WastLexer::MatchChar(char c) {
232
22.8M
  if (PeekChar() == c) {
233
1.08M
    ReadChar();
234
1.08M
    return true;
235
1.08M
  }
236
21.7M
  return false;
237
22.8M
}
238
239
18.8M
bool WastLexer::MatchString(std::string_view s) {
240
18.8M
  const char* saved_cursor = cursor_;
241
37.9M
  for (char c : s) {
242
37.9M
    if (ReadChar() != c) {
243
17.1M
      cursor_ = saved_cursor;
244
17.1M
      return false;
245
17.1M
    }
246
37.9M
  }
247
1.73M
  return true;
248
18.8M
}
249
250
74.1M
void WastLexer::Newline() {
251
74.1M
  line_++;
252
74.1M
  line_start_ = cursor_;
253
74.1M
}
254
255
6.13k
bool WastLexer::ReadBlockComment() {
256
6.13k
  int nesting = 1;
257
2.88M
  while (true) {
258
2.88M
    switch (ReadChar()) {
259
145
      case kEof:
260
145
        ERROR("EOF in block comment");
261
145
        return false;
262
263
116k
      case ';':
264
116k
        if (MatchChar(')') && --nesting == 0) {
265
5.98k
          return true;
266
5.98k
        }
267
110k
        break;
268
269
678k
      case '(':
270
678k
        if (MatchChar(';')) {
271
589k
          nesting++;
272
589k
        }
273
678k
        break;
274
275
607k
      case '\n':
276
607k
        Newline();
277
607k
        break;
278
2.88M
    }
279
2.88M
  }
280
6.13k
}
281
282
1.15k
bool WastLexer::ReadLineComment() {
283
2.01k
  while (true) {
284
2.01k
    switch (ReadChar()) {
285
15
      case kEof:
286
15
        return false;
287
288
900
      case '\r':
289
900
        if (PeekChar() == '\n') {
290
415
          ReadChar();
291
415
        }
292
900
        Newline();
293
900
        return true;
294
295
235
      case '\n':
296
235
        Newline();
297
235
        return true;
298
2.01k
    }
299
2.01k
  }
300
1.15k
}
301
302
981k
void WastLexer::ReadWhitespace() {
303
68.6M
  while (true) {
304
68.6M
    switch (PeekChar()) {
305
248k
      case ' ':
306
331k
      case '\t':
307
463k
      case '\r':
308
463k
        ReadChar();
309
463k
        break;
310
311
67.2M
      case '\n':
312
67.2M
        ReadChar();
313
67.2M
        Newline();
314
67.2M
        break;
315
316
981k
      default:
317
981k
        return;
318
68.6M
    }
319
68.6M
  }
320
981k
}
321
322
1.83M
// Scans a double-quoted string literal starting at the cursor.
//
// Returns a Text token whose text covers the whole literal when it is well
// formed, an Invalid token when any error was reported while scanning it, and
// an Eof token if the input ends before the closing quote.
//
// While scanning, token_start_ is temporarily moved so that diagnostics point
// at the offending newline or escape sequence; it is restored before the
// Text/Invalid token is built (but not on the Eof path).
Token WastLexer::GetStringToken() {
  const char* const string_start = token_start_;
  bool has_error = false;

  // Reports the escape sequence spanning [token_start_, cursor_) as invalid.
  // Callers position token_start_ before invoking this.
  const auto report_bad_escape = [this, &has_error]() {
    ERROR("bad escape \"%.*s\"",
          static_cast<int>(cursor_ - token_start_), token_start_);
    has_error = true;
  };

  // Consume the first character of the literal (the opening quote when
  // reached from GetToken or ReadReservedChars).
  ReadChar();

  for (bool in_string = true; in_string;) {
    switch (ReadChar()) {
      case kEof:
        return BareToken(TokenType::Eof);

      case '\n':
        // Point the diagnostic at the newline itself.
        token_start_ = cursor_ - 1;
        ERROR("newline in string");
        has_error = true;
        Newline();
        continue;

      case '"':
        // A quote directly followed by another quote is rejected.
        if (PeekChar() == '"') {
          ERROR("invalid string token");
          has_error = true;
        }
        in_string = false;
        break;

      case '\\': {
        switch (ReadChar()) {
          case 't':
          case 'n':
          case 'r':
          case '"':
          case '\'':
          case '\\':
            // Valid escape.
            break;

          case '0': case '1': case '2': case '3': case '4':
          case '5': case '6': case '7': case '8': case '9':
          case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
          case 'A': case 'B': case 'C': case 'D': case 'E':
          case 'F':  // Hex byte escape.
            if (IsHexDigit(PeekChar())) {
              ReadChar();
            } else {
              token_start_ = cursor_ - 2;
              report_bad_escape();
            }
            break;

          case 'u': {
            token_start_ = cursor_ - 2;
            if (ReadChar() != '{') {
              report_bad_escape();
              break;
            }

            // Value must be a valid unicode scalar value.
            uint32_t digit;
            uint32_t scalar_value = 0;
            bool too_large = false;

            while (IsHexDigit(PeekChar())) {
              ParseHexdigit(*cursor_++, &digit);

              scalar_value = (scalar_value << 4) | digit;
              // Maximum value of a unicode code point.
              if (scalar_value >= 0x110000) {
                too_large = true;
                break;
              }
            }

            if (too_large || PeekChar() != '}') {
              report_bad_escape();
              break;
            }

            // Scalars between 0xd800 and 0xdfff are not allowed. The second
            // condition holds when no hex digit was read, i.e. "\u{}".
            if ((scalar_value >= 0xd800 && scalar_value < 0xe000) ||
                token_start_ == cursor_ - 3) {
              // Consume the '}' so that it is included in the message.
              ReadChar();
              report_bad_escape();
            }
            break;
          }

          default:
            token_start_ = cursor_ - 2;
            report_bad_escape();
            break;
        }
        break;
      }
    }
  }

  token_start_ = string_start;
  if (has_error) {
    return Token(GetLocation(), TokenType::Invalid);
  }

  return TextToken(TokenType::Text);
}
442
443
// static
444
235M
// Returns true if character `c` belongs to the class selected by `bit`.
// `c` must lie in [-1, 255]; the table is indexed by c + 1, so slot 0 serves
// the value -1. Slots that have no explicit initializer below are zero, i.e.
// such characters belong to no class.
bool WastLexer::IsCharClass(int c, CharClass bit) {
  // Generated by the following python script:
  //
  //   def Range(c, lo, hi): return lo <= c <= hi
  //   def IsDigit(c): return Range(c, '0', '9')
  //   def IsHexDigit(c): return IsDigit(c) or Range(c.lower(), 'a', 'f')
  //   def IsKeyword(c): return Range(c, 'a', 'z')
  //   def IsIdChar(c): return Range(c, '!', '~') and c not in '"(),;[]{}'
  //
  //   print ([0] + [
  //       (8 if IsDigit(c) else 0) |
  //       (4 if IsHexDigit(c) else 0) |
  //       (2 if IsKeyword(c) else 0) |
  //       (1 if IsIdChar(c) else 0)
  //       for c in map(chr, range(0, 127))
  //   ])
  static const char kCharClasses[257] = {
      0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,
      0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  1,  0,  1,  1,
      1,  1,  1, 0, 0, 1, 1, 0, 1, 1, 1, 13, 13, 13, 13, 13, 13, 13, 13,
      13, 13, 1, 0, 1, 1, 1, 1, 1, 5, 5, 5,  5,  5,  5,  1,  1,  1,  1,
      1,  1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  1,  1,  1,  1,  0,  1,  0,
      1,  1,  1, 7, 7, 7, 7, 7, 7, 3, 3, 3,  3,  3,  3,  3,  3,  3,  3,
      3,  3,  3, 3, 3, 3, 3, 3, 3, 3, 0, 1,  0,  1,
  };

  assert(c >= -1 && c < 256);
  const int class_bits = kCharClasses[c + 1];
  const int wanted = static_cast<int>(bit);
  return (class_bits & wanted) != 0;
}
473
474
9.91M
bool WastLexer::ReadNum() {
475
9.91M
  if (IsDigit(PeekChar())) {
476
9.89M
    ReadChar();
477
9.89M
    return MatchChar('_') || IsDigit(PeekChar()) ? ReadNum() : true;
478
9.89M
  }
479
16.6k
  return false;
480
9.91M
}
481
482
7.76M
bool WastLexer::ReadHexNum() {
483
7.76M
  if (IsHexDigit(PeekChar())) {
484
7.76M
    ReadChar();
485
7.76M
    return MatchChar('_') || IsHexDigit(PeekChar()) ? ReadHexNum() : true;
486
7.76M
  }
487
4.50k
  return false;
488
7.76M
}
489
490
13.8M
// Consumes a maximal run of id characters and string literals starting at the
// cursor, and classifies what was consumed:
//   None - nothing was consumed,
//   Id   - only id characters were consumed,
//   Some - a string literal was consumed after which no further id
//          characters followed that would not already have been classified;
//          in practice: the run contains a string literal at or after the
//          point where the result would otherwise still be None or Id.
WastLexer::ReservedChars WastLexer::ReadReservedChars() {
  ReservedChars kind{ReservedChars::None};
  for (;;) {
    const int next = PeekChar();
    if (IsIdChar(next)) {
      ReadChar();
      // Only the first id character upgrades None; an earlier Some is kept.
      if (kind == ReservedChars::None) {
        kind = ReservedChars::Id;
      }
      continue;
    }
    if (next != '"') {
      break;
    }
    // An embedded string literal: scan past it and discard the token.
    GetStringToken();
    kind = ReservedChars::Some;
  }
  return kind;
}
508
509
95.4k
void WastLexer::ReadSign() {
510
95.4k
  if (PeekChar() == '+' || PeekChar() == '-') {
511
71.1k
    ReadChar();
512
71.1k
  }
513
95.4k
}
514
515
1.22M
// Scans a decimal number at the cursor (any sign was consumed by the caller).
//
// `token_type` is the type to use for a plain integer; it is replaced by
// Float as soon as a '.' or an exponent ('e'/'E') is seen. If the text is not
// a well-formed number, or NoTrailingReservedChars() reports false after it,
// the result comes from GetReservedToken() instead.
Token WastLexer::GetNumberToken(TokenType token_type) {
  if (!ReadNum()) {
    return GetReservedToken();
  }

  if (MatchChar('.')) {
    token_type = TokenType::Float;
    // The fraction digits are optional, but if present must be well formed.
    if (IsDigit(PeekChar()) && !ReadNum()) {
      return GetReservedToken();
    }
  }

  if (MatchChar('e') || MatchChar('E')) {
    token_type = TokenType::Float;
    ReadSign();
    // An exponent marker must be followed by digits.
    if (!ReadNum()) {
      return GetReservedToken();
    }
  }

  if (!NoTrailingReservedChars()) {
    return GetReservedToken();
  }

  const LiteralType literal_type = token_type == TokenType::Float
                                       ? LiteralType::Float
                                       : LiteralType::Int;
  return LiteralToken(token_type, literal_type);
}
540
541
237k
// Scans a hexadecimal number at the cursor (any sign and the "0x" prefix were
// consumed by the caller).
//
// `token_type` is the type to use for a plain integer; it is replaced by
// Float as soon as a '.' or a binary exponent ('p'/'P') is seen. If the text
// is not a well-formed number, or NoTrailingReservedChars() reports false
// after it, the result comes from GetReservedToken() instead.
Token WastLexer::GetHexNumberToken(TokenType token_type) {
  if (!ReadHexNum()) {
    return GetReservedToken();
  }

  if (MatchChar('.')) {
    token_type = TokenType::Float;
    // The fraction digits are optional, but if present must be well formed.
    if (IsHexDigit(PeekChar()) && !ReadHexNum()) {
      return GetReservedToken();
    }
  }

  if (MatchChar('p') || MatchChar('P')) {
    token_type = TokenType::Float;
    ReadSign();
    // The exponent itself is written in decimal.
    if (!ReadNum()) {
      return GetReservedToken();
    }
  }

  if (!NoTrailingReservedChars()) {
    return GetReservedToken();
  }

  const LiteralType literal_type = token_type == TokenType::Float
                                       ? LiteralType::Hexfloat
                                       : LiteralType::Int;
  return LiteralToken(token_type, literal_type);
}
566
567
1.28M
// Scans text at the cursor that begins with 'i'. Exactly "inf" (with
// NoTrailingReservedChars() true afterwards) yields a Float/Infinity literal;
// "inf" followed by more yields the result of GetReservedToken(); anything
// else is handed to GetKeywordToken().
Token WastLexer::GetInfToken() {
  if (!MatchString("inf")) {
    return GetKeywordToken();
  }
  if (NoTrailingReservedChars()) {
    return LiteralToken(TokenType::Float, LiteralType::Infinity);
  }
  return GetReservedToken();
}
576
577
380k
// Scans text at the cursor that begins with 'n'. Accepts "nan" and
// "nan:0x<hexnum>" (each with NoTrailingReservedChars() true afterwards) as a
// Float/Nan literal. Everything else is handed to GetKeywordToken(), which
// continues from wherever the cursor was left.
Token WastLexer::GetNanToken() {
  if (MatchString("nan")) {
    const bool is_nan_literal =
        MatchChar(':')
            ? (MatchString("0x") && ReadHexNum() && NoTrailingReservedChars())
            : NoTrailingReservedChars();
    if (is_nan_literal) {
      return LiteralToken(TokenType::Float, LiteralType::Nan);
    }
  }
  return GetKeywordToken();
}
589
590
// Scans a "<name><number>" token such as "align=8" or "offset=0x10", where
// `name` includes the trailing '='. On success returns a token of
// `token_type` whose text is the number only (the `name` prefix is skipped).
// If `name` does not match, the number is malformed, or
// NoTrailingReservedChars() reports false, the text is handed to
// GetKeywordToken(), which continues from wherever the cursor was left.
Token WastLexer::GetNameEqNumToken(std::string_view name,
                                   TokenType token_type) {
  if (MatchString(name)) {
    const bool has_number = MatchString("0x") ? ReadHexNum() : ReadNum();
    if (has_number && NoTrailingReservedChars()) {
      return TextToken(token_type, name.size());
    }
  }
  return GetKeywordToken();
}
603
604
3.72M
// Consumes a run of reserved characters and returns it as a Var token when
// ReadReservedChars() classifies the run as Id, and as a Reserved token
// otherwise.
Token WastLexer::GetIdChars() {
  const TokenType token_type = ReadReservedChars() == ReservedChars::Id
                                   ? TokenType::Var
                                   : TokenType::Reserved;
  return TextToken(token_type);
}
611
612
7.98M
// Consumes the rest of the current word and looks it up in the keyword set
// (Perfect_Hash::InWordSet). Unknown words become Reserved text tokens.
// Known words become a bare token, a token carrying the entry's value_type,
// or a token carrying the entry's opcode, depending on the entry's token
// type.
Token WastLexer::GetKeywordToken() {
  ReadReservedChars();

  TokenInfo* info =
      Perfect_Hash::InWordSet(token_start_, cursor_ - token_start_);
  if (info == nullptr) {
    return TextToken(TokenType::Reserved);
  }

  if (IsTokenTypeBare(info->token_type)) {
    return BareToken(info->token_type);
  }

  if (IsTokenTypeType(info->token_type) ||
      IsTokenTypeRefKind(info->token_type)) {
    return Token(GetLocation(), info->token_type, info->value_type);
  }

  // Every remaining keyword is expected to be an opcode.
  assert(IsTokenTypeOpcode(info->token_type));
  return Token(GetLocation(), info->token_type, info->opcode);
}
629
630
609k
// Consumes the rest of the current run of reserved characters and returns
// everything from token_start_ to the cursor as a Reserved text token.
Token WastLexer::GetReservedToken() {
  // Only the cursor advance matters here; the classification is unused.
  static_cast<void>(ReadReservedChars());
  return TextToken(TokenType::Reserved);
}
634
635
25.6M
// Formats a printf-style message and appends it to the error list as an
// ErrorLevel::Error entry located at `loc`.
void WastLexer::Error(Location loc, const char* format, ...) {
  // The macro introduces `buffer` (and `length`) holding the formatted text.
  // NOTE(review): presumably stack-allocated, judging by the macro's name;
  // confirm against its definition.
  WABT_SNPRINTF_ALLOCA(buffer, length, format);

  errors_->emplace_back(ErrorLevel::Error, loc, buffer);
}
639
640
}  // namespace wabt