Coverage Report

Created: 2026-06-13 07:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/wabt/src/wast-lexer.cc
Line
Count
Source
1
/*
2
 * Copyright 2016 WebAssembly Community Group participants
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include "wabt/wast-lexer.h"
18
19
#include <cassert>
20
#include <cstdio>
21
22
#include "wabt/config.h"
23
24
#include "wabt/lexer-source.h"
25
26
23.5M
#define ERROR(...) Error(GetLocation(), __VA_ARGS__)
27
28
namespace wabt {
29
30
namespace {
31
32
#if __clang__
33
#pragma clang diagnostic push
34
#pragma clang diagnostic ignored "-Wimplicit-fallthrough"
35
#endif
36
#include "prebuilt/lexer-keywords.cc"
37
#if __clang__
38
#pragma clang diagnostic pop
39
#endif
40
41
}  // namespace
42
43
// Constructs a lexer over the bytes exposed by `source`.
//
// `source` is moved into source_; `filename` is stored in filename_ and is
// later attached to every reported error; `errors` is the sink that Error()
// appends to. Lexing starts at line 1 with line_start_, token_start_ and
// cursor_ all pointing at the first byte of the buffer.
//
// Note that buffer_ and buffer_end_ are computed from the member source_
// (the parameter has already been moved from), and the later initializers
// read buffer_. NOTE(review): this relies on the members being declared in
// this same order in the class definition -- confirm against the header.
WastLexer::WastLexer(std::unique_ptr<LexerSource> source,
44
                     std::string_view filename,
45
                     Errors* errors)
46
22.8k
    : source_(std::move(source)),
47
22.8k
      filename_(filename),
48
22.8k
      line_(1),
49
22.8k
      buffer_(static_cast<const char*>(source_->data())),
50
22.8k
      buffer_end_(buffer_ + source_->size()),
51
22.8k
      line_start_(buffer_),
52
22.8k
      token_start_(buffer_),
53
22.8k
      cursor_(buffer_),
54
22.8k
      errors_(errors) {}
55
56
// static
57
std::unique_ptr<WastLexer> WastLexer::CreateBufferLexer(
58
    std::string_view filename,
59
    const void* data,
60
    size_t size,
61
22.8k
    Errors* errors) {
62
22.8k
  return std::make_unique<WastLexer>(std::make_unique<LexerSource>(data, size),
63
22.8k
                                     filename, errors);
64
22.8k
}
65
66
27.9M
// Scans and returns the next token.
//
// Whitespace and comments are skipped. A byte that cannot start any token is
// reported as "unexpected char" and skipped, after which scanning resumes.
// An unterminated block comment, or a line comment that runs to the end of
// input, yields an Eof token.
//
// Dispatch is on the first byte of the token; token_start_ is reset to the
// cursor at the start of every attempt so locations and token text cover only
// the token actually returned.
Token WastLexer::GetToken() {
  while (true) {
    token_start_ = cursor_;
    const int first = PeekChar();
    switch (first) {
      case kEof:
        return BareToken(TokenType::Eof);

      case '(':
        if (MatchString("(;")) {
          if (ReadBlockComment()) {
            continue;
          }
          // ReadBlockComment already reported the unterminated comment.
          return BareToken(TokenType::Eof);
        }
        if (MatchString("(@")) {
          GetIdChars();
          // offset=2 to skip the "(@" prefix
          return TextToken(TokenType::LparAnn, 2);
        }
        ReadChar();
        return BareToken(TokenType::Lpar);

      case ')':
        ReadChar();
        return BareToken(TokenType::Rpar);

      case ';':
        if (MatchString(";;")) {
          if (ReadLineComment()) {
            continue;
          }
          return BareToken(TokenType::Eof);
        }
        // A lone ';' is not a token.
        ReadChar();
        ERROR("unexpected char");
        continue;

      case ' ':
      case '\t':
      case '\r':
      case '\n':
        ReadWhitespace();
        continue;

      case '"':
        return GetStringToken(TokenType::Text);

      case '+':
      case '-': {
        // Signed literal: consume the sign, then classify by what follows.
        ReadChar();
        const int after_sign = PeekChar();
        if (after_sign == 'i') {
          return GetInfToken();
        }
        if (after_sign == 'n') {
          return GetNanToken();
        }
        if (after_sign == '0') {
          return MatchString("0x") ? GetHexNumberToken(TokenType::Int)
                                   : GetNumberToken(TokenType::Int);
        }
        if (after_sign >= '1' && after_sign <= '9') {
          return GetNumberToken(TokenType::Int);
        }
        return GetReservedToken();
      }

      case '0':
        return MatchString("0x") ? GetHexNumberToken(TokenType::Nat)
                                 : GetNumberToken(TokenType::Nat);

      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
      case '8':
      case '9':
        return GetNumberToken(TokenType::Nat);

      case '$':
        ReadChar();
        if (PeekChar() == '"') {
          return GetStringToken(TokenType::Var);
        }
        return GetIdChars();  // Initial $ is idchar, so this produces id token

      case 'a':
        return GetNameEqNumToken("align=", TokenType::AlignEqNat);

      case 'i':
        return GetInfToken();

      case 'n':
        return GetNanToken();

      case 'o':
        return GetNameEqNumToken("offset=", TokenType::OffsetEqNat);

      default:
        // The cursor has not moved since `first` was peeked, so it is still
        // the current byte.
        if (IsKeyword(first)) {
          return GetKeywordToken();
        }
        if (IsIdChar(first)) {
          return GetReservedToken();
        }
        ReadChar();
        ERROR("unexpected char");
        continue;
    }
  }
}
192
193
52.6M
// Builds the location of the current token: the current line number plus the
// 1-based columns of token_start_ and cursor_ relative to line_start_.
//
// Columns are clamped to a minimum of 1 because the pointer may lie before
// line_start_ (Newline() moves line_start_ forward while a token that began
// on an earlier line is still being scanned).
Location WastLexer::GetLocation() {
  const auto column_of = [this](const char* p) -> int {
    const int one_based = static_cast<int>(p - line_start_ + 1);
    return one_based < 1 ? 1 : one_based;
  };
  const int first_column = column_of(token_start_);
  const int last_column = column_of(cursor_);
  return Location(line_, first_column, last_column);
}
199
200
9.47M
std::string_view WastLexer::GetText(size_t offset) {
201
  // Bounds checks are necessary because token_start may have been moved
202
  // (e.g. if GetStringToken found a newline and reset token_start to
203
  // point at it).
204
205
9.47M
  if (token_start_ + offset >= buffer_end_)
206
65
    return {};
207
208
9.47M
  if (cursor_ <= token_start_ + offset)
209
798k
    return {};
210
211
8.67M
  return std::string_view(token_start_ + offset,
212
8.67M
                          (cursor_ - token_start_) - offset);
213
9.47M
}
214
215
14.6M
// Makes a token of kind `token_type` that carries only its source location.
Token WastLexer::BareToken(TokenType token_type) {
  const Location where = GetLocation();
  return Token(where, token_type);
}
218
219
1.13M
// Makes a literal token of kind `token_type` whose payload is a Literal of
// `literal_type` built from the full text of the current token.
Token WastLexer::LiteralToken(TokenType token_type, LiteralType literal_type) {
  const Location where = GetLocation();
  const std::string_view text = GetText();
  return Token(where, token_type, Literal(literal_type, text));
}
222
223
8.33M
// Makes a token of kind `token_type` whose payload is the current token's
// text with its first `offset` bytes skipped (see GetText).
Token WastLexer::TextToken(TokenType token_type, size_t offset) {
  const Location where = GetLocation();
  const std::string_view text = GetText(offset);
  return Token(where, token_type, text);
}
226
227
391M
int WastLexer::PeekChar() {
228
391M
  return cursor_ < buffer_end_ ? static_cast<uint8_t>(*cursor_) : kEof;
229
391M
}
230
231
338M
int WastLexer::ReadChar() {
232
338M
  return cursor_ < buffer_end_ ? static_cast<uint8_t>(*cursor_++) : kEof;
233
338M
}
234
235
19.6M
bool WastLexer::MatchChar(char c) {
236
19.6M
  if (PeekChar() == c) {
237
271k
    ReadChar();
238
271k
    return true;
239
271k
  }
240
19.3M
  return false;
241
19.6M
}
242
243
18.8M
bool WastLexer::MatchString(std::string_view s) {
244
18.8M
  const char* saved_cursor = cursor_;
245
38.1M
  for (char c : s) {
246
38.1M
    if (ReadChar() != c) {
247
17.7M
      cursor_ = saved_cursor;
248
17.7M
      return false;
249
17.7M
    }
250
38.1M
  }
251
1.16M
  return true;
252
18.8M
}
253
254
105M
// Records a line break: bumps the line counter and marks the current cursor
// position as the start of the new line. Callers in this file invoke it
// after the '\n' (or "\r\n" / lone '\r' in line comments) has been consumed,
// so line_start_ ends up on the first byte of the next line.
void WastLexer::Newline() {
255
105M
  line_++;
256
105M
  line_start_ = cursor_;
257
105M
}
258
259
392
bool WastLexer::ReadBlockComment() {
260
392
  int nesting = 1;
261
2.40M
  while (true) {
262
2.40M
    switch (ReadChar()) {
263
136
      case kEof:
264
136
        ERROR("EOF in block comment");
265
136
        return false;
266
267
4.70k
      case ';':
268
4.70k
        if (MatchChar(')') && --nesting == 0) {
269
256
          return true;
270
256
        }
271
4.44k
        break;
272
273
41.2k
      case '(':
274
41.2k
        if (MatchChar(';')) {
275
292
          nesting++;
276
292
        }
277
41.2k
        break;
278
279
586k
      case '\n':
280
586k
        Newline();
281
586k
        break;
282
2.40M
    }
283
2.40M
  }
284
392
}
285
286
1.19k
bool WastLexer::ReadLineComment() {
287
115k
  while (true) {
288
115k
    switch (ReadChar()) {
289
14
      case kEof:
290
14
        return false;
291
292
650
      case '\r':
293
650
        if (PeekChar() == '\n') {
294
201
          ReadChar();
295
201
        }
296
650
        Newline();
297
650
        return true;
298
299
528
      case '\n':
300
528
        Newline();
301
528
        return true;
302
115k
    }
303
115k
  }
304
1.19k
}
305
306
2.05M
void WastLexer::ReadWhitespace() {
307
102M
  while (true) {
308
102M
    switch (PeekChar()) {
309
201k
      case ' ':
310
338k
      case '\t':
311
389k
      case '\r':
312
389k
        ReadChar();
313
389k
        break;
314
315
100M
      case '\n':
316
100M
        ReadChar();
317
100M
        Newline();
318
100M
        break;
319
320
2.05M
      default:
321
2.05M
        return;
322
102M
    }
323
102M
  }
324
2.05M
}
325
326
840k
// Scans a quoted string whose opening '"' is the byte under the cursor.
//
// On success returns a text token of kind `token_type` covering everything
// from the token start through the closing quote. If any error was reported
// while scanning (raw newline, bad escape, or a second '"' directly after the
// closing quote) an Invalid token is returned instead. If the input ends
// before the closing quote, an Eof token is returned immediately; in that
// path token_start_ is NOT restored and may still point at the last error
// site.
//
// While scanning, token_start_ is temporarily moved so that each error is
// reported at the offending newline or escape sequence; it is restored to the
// original token start before the final token is built.
Token WastLexer::GetStringToken(TokenType token_type) {
  const char* const string_start = token_start_;
  bool saw_error = false;

  // Reports the escape sequence spanning [token_start_, cursor_).
  auto report_bad_escape = [&] {
    ERROR("bad escape \"%.*s\"", static_cast<int>(cursor_ - token_start_),
          token_start_);
    saw_error = true;
  };

  ReadChar();  // Opening quote.
  for (bool closed = false; !closed;) {
    const int c = ReadChar();

    if (c == kEof) {
      return BareToken(TokenType::Eof);
    }

    if (c == '\n') {
      // Point the error at the newline itself, then start the next line.
      token_start_ = cursor_ - 1;
      ERROR("newline in string");
      saw_error = true;
      Newline();
      continue;
    }

    if (c == '"') {
      if (PeekChar() == '"') {
        ERROR("invalid string token");
        saw_error = true;
      }
      closed = true;
      continue;
    }

    if (c != '\\') {
      continue;  // Ordinary string byte.
    }

    // Escape sequence: the backslash is consumed, classify what follows.
    const int escape = ReadChar();
    switch (escape) {
      case 't':
      case 'n':
      case 'r':
      case '"':
      case '\'':
      case '\\':
        // Valid escape.
        continue;
      default:
        break;
    }

    if (IsHexDigit(escape)) {
      // Hex byte escape: exactly two hex digits.
      if (IsHexDigit(PeekChar())) {
        ReadChar();
      } else {
        token_start_ = cursor_ - 2;
        report_bad_escape();
      }
      continue;
    }

    // Everything below is either "\u{...}" or an unknown escape; both are
    // reported starting two bytes before the cursor.
    token_start_ = cursor_ - 2;
    if (escape != 'u') {
      report_bad_escape();
      continue;
    }
    if (ReadChar() != '{') {
      report_bad_escape();
      continue;
    }

    // Value must be a valid unicode scalar value.
    uint32_t digit;
    uint32_t scalar_value = 0;
    bool too_large = false;
    while (IsHexDigit(PeekChar())) {
      ParseHexdigit(*cursor_++, &digit);
      scalar_value = (scalar_value << 4) | digit;
      // Maximum value of a unicode code point.
      if (scalar_value >= 0x110000) {
        too_large = true;
        break;
      }
    }
    if (too_large || PeekChar() != '}') {
      report_bad_escape();
      continue;
    }

    // Scalars between 0xd800 and 0xdfff are not allowed, and neither is an
    // empty "\u{}" (cursor exactly three bytes past the backslash). The '}'
    // is consumed first so it shows up in the error text. On success the '}'
    // is left for the next iteration, where it is skipped as a plain byte.
    const bool is_surrogate = scalar_value >= 0xd800 && scalar_value < 0xe000;
    if (is_surrogate || token_start_ == cursor_ - 3) {
      ReadChar();
      report_bad_escape();
    }
  }

  token_start_ = string_start;
  if (saw_error) {
    return Token(GetLocation(), TokenType::Invalid);
  }
  return TextToken(token_type);
}
446
447
// static
448
215M
// Tests whether `c` belongs to the character class selected by `bit`.
//
// `c` must be a byte value (0..255) or -1; slot 0 of the table is the entry
// for -1 and byte b lives in slot b + 1. Each entry is a bit set; only bytes
// up to '~' have non-zero entries, the rest of the table is zero-filled.
bool WastLexer::IsCharClass(int c, CharClass bit) {
  // Generated by the following python script:
  //
  //   def Range(c, lo, hi): return lo <= c <= hi
  //   def IsDigit(c): return Range(c, '0', '9')
  //   def IsHexDigit(c): return IsDigit(c) or Range(c.lower(), 'a', 'f')
  //   def IsKeyword(c): return Range(c, 'a', 'z')
  //   def IsIdChar(c): return Range(c, '!', '~') and c not in '"(),;[]{}'
  //
  //   print ([0] + [
  //       (8 if IsDigit(c) else 0) |
  //       (4 if IsHexDigit(c) else 0) |
  //       (2 if IsKeyword(c) else 0) |
  //       (1 if IsIdChar(c) else 0)
  //       for c in map(chr, range(0, 127))
  //   ])
  static const char kCharClasses[257] = {
      // -1
      0,
      // 0x00 - 0x0f (control characters)
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x10 - 0x1f (control characters)
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x20 - 0x2f: space and punctuation up to '/'
      0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
      // 0x30 - 0x3f: '0'-'9', then ':' through '?'
      13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 1, 0, 1, 1, 1, 1,
      // 0x40 - 0x4f: '@', 'A'-'O'
      1, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      // 0x50 - 0x5f: 'P'-'Z', then '[' through '_'
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
      // 0x60 - 0x6f: '`', 'a'-'o'
      1, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      // 0x70 - 0x7e: 'p'-'z', then '{' through '~'
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 1, 0, 1,
  };

  assert(c >= -1 && c < 256);
  const int classes = kCharClasses[c + 1];
  const int mask = static_cast<int>(bit);
  return (classes & mask) != 0;
}
477
478
7.81M
bool WastLexer::ReadNum() {
479
7.81M
  if (IsDigit(PeekChar())) {
480
7.78M
    ReadChar();
481
7.78M
    return MatchChar('_') || IsDigit(PeekChar()) ? ReadNum() : true;
482
7.78M
  }
483
28.3k
  return false;
484
7.81M
}
485
486
8.14M
bool WastLexer::ReadHexNum() {
487
8.14M
  if (IsHexDigit(PeekChar())) {
488
8.13M
    ReadChar();
489
8.13M
    return MatchChar('_') || IsHexDigit(PeekChar()) ? ReadHexNum() : true;
490
8.13M
  }
491
5.79k
  return false;
492
8.14M
}
493
494
15.7M
// Consumes the longest run made of id characters and quoted strings, and
// reports what the run contained:
//   None - nothing was consumed,
//   Id   - only id characters,
//   Some - at least one quoted string was part of the run.
// Once a string has been seen the result stays Some. Embedded strings are
// scanned with GetStringToken (which reports their errors); the token it
// returns is discarded.
WastLexer::ReservedChars WastLexer::ReadReservedChars() {
  ReservedChars seen = ReservedChars::None;
  for (;;) {
    const int next = PeekChar();
    if (next == '"') {
      GetStringToken(TokenType::Text);
      seen = ReservedChars::Some;
      continue;
    }
    if (!IsIdChar(next)) {
      break;
    }
    ReadChar();
    if (seen == ReservedChars::None) {
      seen = ReservedChars::Id;
    }
  }
  return seen;
}
512
513
75.8k
void WastLexer::ReadSign() {
514
75.8k
  if (PeekChar() == '+' || PeekChar() == '-') {
515
47.4k
    ReadChar();
516
47.4k
  }
517
75.8k
}
518
519
1.08M
// Scans a decimal number starting at the cursor (any sign has already been
// consumed by the caller).
//
// `token_type` is the kind to use for a plain integer; a '.' fraction or an
// 'e'/'E' exponent (with optional sign) turns the token into a Float. If the
// text is not a well-formed number, or further id characters are glued to it,
// the whole run is returned as a Reserved token instead.
Token WastLexer::GetNumberToken(TokenType token_type) {
  if (!ReadNum()) {
    return GetReservedToken();
  }

  if (MatchChar('.')) {
    token_type = TokenType::Float;
    // The fraction digits are optional ("1." is accepted), but if present
    // they must form a valid numeral.
    if (IsDigit(PeekChar()) && !ReadNum()) {
      return GetReservedToken();
    }
  }

  if (MatchChar('e') || MatchChar('E')) {
    token_type = TokenType::Float;
    ReadSign();
    if (!ReadNum()) {
      return GetReservedToken();
    }
  }

  if (!NoTrailingReservedChars()) {
    return GetReservedToken();
  }

  const LiteralType literal_type = token_type == TokenType::Float
                                       ? LiteralType::Float
                                       : LiteralType::Int;
  return LiteralToken(token_type, literal_type);
}
544
545
161k
// Scans the digits of a hexadecimal number; the "0x" prefix (and any sign)
// has already been consumed by the caller.
//
// `token_type` is the kind to use for a plain integer; a '.' fraction or a
// 'p'/'P' exponent (optional sign, decimal digits) turns the token into a
// Float with a Hexfloat literal. If the text is not a well-formed number, or
// further id characters are glued to it, the whole run is returned as a
// Reserved token instead.
Token WastLexer::GetHexNumberToken(TokenType token_type) {
  if (!ReadHexNum()) {
    return GetReservedToken();
  }

  if (MatchChar('.')) {
    token_type = TokenType::Float;
    // The fraction digits are optional, but if present they must form a
    // valid hex numeral.
    if (IsHexDigit(PeekChar()) && !ReadHexNum()) {
      return GetReservedToken();
    }
  }

  if (MatchChar('p') || MatchChar('P')) {
    token_type = TokenType::Float;
    ReadSign();
    if (!ReadNum()) {
      return GetReservedToken();
    }
  }

  if (!NoTrailingReservedChars()) {
    return GetReservedToken();
  }

  const LiteralType literal_type = token_type == TokenType::Float
                                       ? LiteralType::Hexfloat
                                       : LiteralType::Int;
  return LiteralToken(token_type, literal_type);
}
570
571
1.33M
// Scans a token that starts with 'i' (optionally after an already-consumed
// sign). "inf" standing on its own becomes a Float token with an Infinity
// literal; "inf" with more id characters attached becomes Reserved; anything
// else is handed to the keyword scanner.
Token WastLexer::GetInfToken() {
  if (!MatchString("inf")) {
    return GetKeywordToken();
  }
  if (NoTrailingReservedChars()) {
    return LiteralToken(TokenType::Float, LiteralType::Infinity);
  }
  return GetReservedToken();
}
580
581
323k
// Scans a token that starts with 'n' (optionally after an already-consumed
// sign). Both "nan" and "nan:0x<hex digits>" become a Float token with a Nan
// literal, provided no further id characters follow. Everything else --
// including a malformed "nan:..." payload -- is handed to the keyword
// scanner, which continues from wherever the cursor ended up.
Token WastLexer::GetNanToken() {
  bool is_nan_literal = false;
  if (MatchString("nan")) {
    if (MatchChar(':')) {
      is_nan_literal =
          MatchString("0x") && ReadHexNum() && NoTrailingReservedChars();
    } else {
      is_nan_literal = NoTrailingReservedChars();
    }
  }

  if (is_nan_literal) {
    return LiteralToken(TokenType::Float, LiteralType::Nan);
  }
  return GetKeywordToken();
}
593
594
// Scans a "<name><number>" token such as "align=8" or "offset=0x10".
//
// `name` is the expected prefix including the '=' and `token_type` the kind
// to produce. The number may be decimal or "0x"-prefixed hex. On success the
// token text is just the number (the prefix is skipped). If the prefix does
// not match, the number is malformed, or extra id characters follow, the
// input is handed to the keyword scanner instead.
Token WastLexer::GetNameEqNumToken(std::string_view name,
                                   TokenType token_type) {
  if (MatchString(name)) {
    const bool has_number = MatchString("0x") ? ReadHexNum() : ReadNum();
    if (has_number && NoTrailingReservedChars()) {
      return TextToken(token_type, name.size());
    }
  }
  return GetKeywordToken();
}
607
608
4.55M
// Consumes the rest of an identifier-like run and returns it as a text token:
// Var normally, Reserved if the run contained a quoted string.
Token WastLexer::GetIdChars() {
  const bool has_string = ReadReservedChars() == ReservedChars::Some;
  const TokenType token_type =
      has_string ? TokenType::Reserved : TokenType::Var;
  return TextToken(token_type);
}
615
616
9.37M
// Consumes the rest of the current word and looks the whole token text up in
// the generated keyword table.
//
// Unknown words become Reserved text tokens. Known words become, depending on
// the table entry: a bare token, a token carrying the entry's value type (for
// type and reference-kind keywords), or a token carrying the entry's opcode.
Token WastLexer::GetKeywordToken() {
  ReadReservedChars();

  TokenInfo* info =
      Perfect_Hash::InWordSet(token_start_, cursor_ - token_start_);
  if (!info) {
    return TextToken(TokenType::Reserved);
  }

  const TokenType keyword_type = info->token_type;
  if (IsTokenTypeBare(keyword_type)) {
    return BareToken(keyword_type);
  }
  if (IsTokenTypeType(keyword_type) || IsTokenTypeRefKind(keyword_type)) {
    return Token(GetLocation(), keyword_type, info->value_type);
  }

  // Every remaining keyword is expected to be an opcode.
  assert(IsTokenTypeOpcode(info->token_type));
  return Token(GetLocation(), keyword_type, info->opcode);
}
633
634
473k
// Consumes whatever id characters and quoted strings follow the cursor and
// returns everything from token_start_ up to the new cursor position as a
// Reserved text token. The classification returned by ReadReservedChars() is
// irrelevant here and is ignored.
Token WastLexer::GetReservedToken() {
635
473k
  ReadReservedChars();
636
473k
  return TextToken(TokenType::Reserved);
637
473k
}
638
639
23.5M
// Records a lexing error at `loc`.
//
// `format` and the variadic arguments are handed to WABT_SNPRINTF_ALLOCA,
// which is defined outside this file; it presumably formats them
// printf-style into a stack buffer named `buffer` (TODO confirm against
// wabt/config.h or wherever the macro lives). The resulting message is
// appended to errors_ as an ErrorLevel::Error entry together with `loc` and
// the lexer's filename_. Lexing is not interrupted; callers decide how to
// continue.
void WastLexer::Error(Location loc, const char* format, ...) {
640
  WABT_SNPRINTF_ALLOCA(buffer, length, format);
641
23.5M
  errors_->emplace_back(ErrorLevel::Error, loc, filename_, buffer);
642
23.5M
}
643
644
}  // namespace wabt