Coverage Report

Created: 2026-04-09 07:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/jsonnet/core/lexer.cpp
Line
Count
Source
1
/*
2
Copyright 2015 Google Inc. All rights reserved.
3
4
Licensed under the Apache License, Version 2.0 (the "License");
5
you may not use this file except in compliance with the License.
6
You may obtain a copy of the License at
7
8
    http://www.apache.org/licenses/LICENSE-2.0
9
10
Unless required by applicable law or agreed to in writing, software
11
distributed under the License is distributed on an "AS IS" BASIS,
12
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
See the License for the specific language governing permissions and
14
limitations under the License.
15
*/
16
17
#include <cassert>
18
19
#include <map>
20
#include <sstream>
21
#include <string>
22
23
#include "lexer.h"
24
#include "static_error.h"
25
#include "unicode.h"
26
27
namespace jsonnet::internal {
28
29
/** Shared empty list of comment lines, passed where a fodder element carries no comment text. */
static const std::vector<std::string> EMPTY;
30
31
/** True for horizontal whitespace: space, tab or carriage return (never \n). */
static bool is_horz_ws(char c)
{
    switch (c) {
        case ' ':
        case '\t':
        case '\r': return true;
        default: return false;
    }
}
36
37
/** Is the char whitespace. */
38
static bool is_ws(char c)
39
724M
{
40
724M
    return c == '\n' || is_horz_ws(c);
41
724M
}
42
43
/** Strip whitespace from both ends of a string, but only up to margin on the left hand side. */
44
static std::string strip_ws(const std::string &s, unsigned margin)
45
19.5M
{
46
19.5M
    if (s.size() == 0)
47
16.4M
        return s;  // Avoid underflow below.
48
3.04M
    size_t i = 0;
49
8.22M
    while (i < s.length() && is_horz_ws(s[i]) && i < margin)
50
5.18M
        i++;
51
3.04M
    size_t j = s.size();
52
7.18M
    while (j > i && is_horz_ws(s[j - 1])) {
53
4.13M
        j--;
54
4.13M
    }
55
3.04M
    return std::string(&s[i], &s[j]);
56
19.5M
}
57
58
/** Split a string by \n and also strip left (up to margin) & right whitespace from each line. */
59
static std::vector<std::string> line_split(const std::string &s, unsigned margin)
60
320k
{
61
320k
    std::vector<std::string> ret;
62
320k
    std::stringstream ss;
63
118M
    for (size_t i = 0; i < s.length(); ++i) {
64
118M
        if (s[i] == '\n') {
65
19.2M
            ret.emplace_back(strip_ws(ss.str(), margin));
66
19.2M
            ss.str("");
67
99.2M
        } else {
68
99.2M
            ss << s[i];
69
99.2M
        }
70
118M
    }
71
320k
    ret.emplace_back(strip_ws(ss.str(), margin));
72
320k
    return ret;
73
320k
}
74
75
/** Consume whitespace.
76
 *
77
 * Return number of \n and number of spaces after last \n.  Convert \t to spaces.
78
 */
79
static void lex_ws(const char *&c, unsigned &new_lines, unsigned &indent, const char *&line_start,
80
                   unsigned long &line_number)
81
313M
{
82
313M
    indent = 0;
83
313M
    new_lines = 0;
84
724M
    for (; *c != '\0' && is_ws(*c); c++) {
85
411M
        switch (*c) {
86
768k
            case '\r':
87
                // Ignore.
88
768k
                break;
89
90
45.5M
            case '\n':
91
45.5M
                indent = 0;
92
45.5M
                new_lines++;
93
45.5M
                line_number++;
94
45.5M
                line_start = c + 1;
95
45.5M
                break;
96
97
365M
            case ' ': indent += 1; break;
98
99
            // This only works for \t at the beginning of lines, but we strip it everywhere else
100
            // anyway.  The only case where this will cause a problem is spaces followed by \t
101
            // at the beginning of a line.  However that is rare, ill-advised, and if re-indentation
102
            // is enabled it will be fixed later.
103
44.3k
            case '\t': indent += 8; break;
104
411M
        }
105
411M
    }
106
313M
}
107
108
/**
109
# Consume all text until the end of the line, return number of newlines after that and indent
110
*/
111
static void lex_until_newline(const char *&c, std::string &text, unsigned &blanks, unsigned &indent,
112
                              const char *&line_start, unsigned long &line_number)
113
8.23M
{
114
8.23M
    const char *original_c = c;
115
8.23M
    const char *last_non_space = c;
116
120M
    for (; *c != '\0' && *c != '\n'; c++) {
117
112M
        if (!is_horz_ws(*c))
118
96.2M
            last_non_space = c;
119
112M
    }
120
8.23M
    text = std::string(original_c, last_non_space - original_c + 1);
121
    // Consume subsequent whitespace including the '\n'.
122
8.23M
    unsigned new_lines;
123
8.23M
    lex_ws(c, new_lines, indent, line_start, line_number);
124
8.23M
    blanks = new_lines == 0 ? 0 : new_lines - 1;
125
8.23M
}
126
127
/** True for the ASCII upper-case letters 'A' through 'Z'. */
static bool is_upper(char c)
{
    return !(c < 'A' || c > 'Z');
}
131
132
/** True for the ASCII lower-case letters 'a' through 'z'. */
static bool is_lower(char c)
{
    return !(c < 'a' || c > 'z');
}
136
137
/** True for the ASCII decimal digits '0' through '9'. */
static bool is_number(char c)
{
    return !(c < '0' || c > '9');
}
141
142
static bool is_identifier_first(char c)
143
864M
{
144
864M
    return is_upper(c) || is_lower(c) || c == '_';
145
864M
}
146
147
static bool is_identifier(char c)
148
694M
{
149
694M
    return is_identifier_first(c) || is_number(c);
150
694M
}
151
152
/** True if c is one of the characters that may form part of an operator. */
static bool is_symbol(char c)
{
    static const char OPERATOR_CHARS[] = {
        '!', '$', ':', '~', '+', '-', '&', '|', '^', '=', '<', '>', '*', '/', '%',
    };
    for (const char candidate : OPERATOR_CHARS) {
        if (c == candidate)
            return true;
    }
    return false;
}
173
174
14.9M
/** Whether c may be the last character of an operator.
 *
 * Returns false for '+', '-', '~', '!' and '$', and true for every other character.
 */
bool allowed_at_end_of_operator(char c) {
    const bool forbidden = c == '+' || c == '-' || c == '~' || c == '!' || c == '$';
    return !forbidden;
}
184
185
/** Reserved words mapped to the token kind each one lexes as; consulted by
 *  lex_get_keyword_kind to tell keywords apart from ordinary identifiers. */
static const std::map<std::string, Token::Kind> keywords = {
186
    {"assert", Token::ASSERT},
187
    {"else", Token::ELSE},
188
    {"error", Token::ERROR},
189
    {"false", Token::FALSE},
190
    {"for", Token::FOR},
191
    {"function", Token::FUNCTION},
192
    {"if", Token::IF},
193
    {"import", Token::IMPORT},
194
    {"importstr", Token::IMPORTSTR},
195
    {"importbin", Token::IMPORTBIN},
196
    {"in", Token::IN},
197
    {"local", Token::LOCAL},
198
    {"null", Token::NULL_LIT},
199
    {"self", Token::SELF},
200
    {"super", Token::SUPER},
201
    {"tailstrict", Token::TAILSTRICT},
202
    {"then", Token::THEN},
203
    {"true", Token::TRUE},
204
};
205
206
/** Classify an identifier-shaped lexeme.
 *
 * \param identifier The text of the lexeme.
 * \returns The keyword's token kind if identifier is in the keywords table, otherwise
 *          Token::IDENTIFIER.
 */
Token::Kind lex_get_keyword_kind(const std::string &identifier)
{
    const auto match = keywords.find(identifier);
    return match != keywords.end() ? match->second : Token::IDENTIFIER;
}
213
214
std::string lex_number(const char *&c, const std::string &filename, const Location &begin)
215
13.4M
{
216
    // This function should be understood with reference to the linked image:
217
    // https://www.json.org/img/number.png
218
219
    // Note, we deviate from the json.org documentation as follows:
220
    // * There is no reason to lex negative numbers as atomic tokens, it is better to parse them
221
    //   as a unary operator combined with a numeric literal.  This avoids x-1 being tokenized as
222
    //   <identifier> <number> instead of the intended <identifier> <binop> <number>.
223
    // * We support digit separators using the _ character for readability in
224
    //   large numeric literals.
225
226
13.4M
    enum State {
227
13.4M
        BEGIN,
228
13.4M
        AFTER_ZERO,
229
13.4M
        AFTER_ONE_TO_NINE,
230
13.4M
        AFTER_INT_UNDERSCORE,
231
13.4M
        AFTER_DOT,
232
13.4M
        AFTER_DIGIT,
233
13.4M
        AFTER_FRAC_UNDERSCORE,
234
13.4M
        AFTER_E,
235
13.4M
        AFTER_EXP_SIGN,
236
13.4M
        AFTER_EXP_DIGIT,
237
13.4M
        AFTER_EXP_UNDERSCORE
238
13.4M
    } state;
239
240
13.4M
    std::string r;
241
242
13.4M
    state = BEGIN;
243
30.7M
    while (true) {
244
30.7M
        switch (state) {
245
13.4M
            case BEGIN:
246
13.4M
                switch (*c) {
247
4.09M
                    case '0': state = AFTER_ZERO; break;
248
249
3.93M
                    case '1':
250
4.72M
                    case '2':
251
5.36M
                    case '3':
252
6.15M
                    case '4':
253
6.31M
                    case '5':
254
6.80M
                    case '6':
255
7.11M
                    case '7':
256
8.39M
                    case '8':
257
9.34M
                    case '9': state = AFTER_ONE_TO_NINE; break;
258
259
0
                    default: throw StaticError(filename, begin, "couldn't lex number");
260
13.4M
                }
261
13.4M
                break;
262
263
13.4M
            case AFTER_ZERO:
264
4.09M
                switch (*c) {
265
44.9k
                    case '.': state = AFTER_DOT; break;
266
267
1.34k
                    case 'e':
268
2.48k
                    case 'E': state = AFTER_E; break;
269
270
5
                    case '_': {
271
5
                        std::stringstream ss;
272
5
                        ss << "couldn't lex number, _ not allowed after leading 0";
273
5
                        throw StaticError(filename, begin, ss.str());
274
1.34k
                    }
275
276
4.04M
                    default: goto end;
277
4.09M
                }
278
47.3k
                break;
279
280
12.3M
            case AFTER_ONE_TO_NINE:
281
12.3M
                switch (*c) {
282
27.5k
                    case '.': state = AFTER_DOT; break;
283
284
2.66k
                    case 'e':
285
5.09k
                    case 'E': state = AFTER_E; break;
286
287
1.23M
                    case '0':
288
1.27M
                    case '1':
289
1.52M
                    case '2':
290
1.61M
                    case '3':
291
1.72M
                    case '4':
292
1.94M
                    case '5':
293
2.08M
                    case '6':
294
2.15M
                    case '7':
295
2.22M
                    case '8':
296
3.01M
                    case '9': state = AFTER_ONE_TO_NINE; break;
297
298
749
                    case '_': state = AFTER_INT_UNDERSCORE; goto skip_char;
299
300
9.30M
                    default: goto end;
301
12.3M
                }
302
3.04M
                break;
303
304
3.04M
            case AFTER_INT_UNDERSCORE:
305
749
                switch (*c) {
306
                    // The only valid transition from _ is to a digit.
307
386
                    case '0':
308
454
                    case '1':
309
478
                    case '2':
310
481
                    case '3':
311
489
                    case '4':
312
639
                    case '5':
313
640
                    case '6':
314
640
                    case '7':
315
729
                    case '8':
316
729
                    case '9': state = AFTER_ONE_TO_NINE; break;
317
318
20
                    default: {
319
20
                        std::stringstream ss;
320
20
                        ss << "couldn't lex number, junk after _: " << *c;
321
20
                        throw StaticError(filename, begin, ss.str());
322
729
                    }
323
749
                }
324
729
                break;
325
326
72.4k
            case AFTER_DOT:
327
72.4k
                switch (*c) {
328
2.26k
                    case '0':
329
25.1k
                    case '1':
330
26.0k
                    case '2':
331
27.0k
                    case '3':
332
27.4k
                    case '4':
333
70.3k
                    case '5':
334
70.9k
                    case '6':
335
71.0k
                    case '7':
336
71.7k
                    case '8':
337
72.4k
                    case '9': state = AFTER_DIGIT; break;
338
339
23
                    default: {
340
23
                        std::stringstream ss;
341
23
                        ss << "couldn't lex number, junk after decimal point: " << *c;
342
23
                        throw StaticError(filename, begin, ss.str());
343
71.7k
                    }
344
72.4k
                }
345
72.4k
                break;
346
347
581k
            case AFTER_DIGIT:
348
581k
                switch (*c) {
349
996
                    case 'e':
350
2.36k
                    case 'E': state = AFTER_E; break;
351
352
106k
                    case '0':
353
177k
                    case '1':
354
202k
                    case '2':
355
249k
                    case '3':
356
274k
                    case '4':
357
343k
                    case '5':
358
389k
                    case '6':
359
414k
                    case '7':
360
441k
                    case '8':
361
508k
                    case '9': state = AFTER_DIGIT; break;
362
363
744
                    case '_': state = AFTER_FRAC_UNDERSCORE; goto skip_char;
364
365
70.0k
                    default: goto end;
366
581k
                }
367
511k
                break;
368
369
511k
            case AFTER_FRAC_UNDERSCORE:
370
744
                switch (*c) {
371
                    // The only valid transition from _ is to a digit.
372
194
                    case '0':
373
231
                    case '1':
374
550
                    case '2':
375
552
                    case '3':
376
558
                    case '4':
377
600
                    case '5':
378
601
                    case '6':
379
601
                    case '7':
380
729
                    case '8':
381
729
                    case '9': state = AFTER_DIGIT; break;
382
383
15
                    default: {
384
15
                        std::stringstream ss;
385
15
                        ss << "couldn't lex number, junk after _: " << *c;
386
15
                        throw StaticError(filename, begin, ss.str());
387
729
                    }
388
744
                }
389
729
                break;
390
391
9.94k
            case AFTER_E:
392
9.94k
                switch (*c) {
393
1.61k
                    case '+':
394
3.57k
                    case '-': state = AFTER_EXP_SIGN; break;
395
396
1.72k
                    case '0':
397
2.63k
                    case '1':
398
3.96k
                    case '2':
399
4.37k
                    case '3':
400
4.47k
                    case '4':
401
5.02k
                    case '5':
402
5.56k
                    case '6':
403
5.64k
                    case '7':
404
5.82k
                    case '8':
405
6.30k
                    case '9': state = AFTER_EXP_DIGIT; break;
406
407
69
                    default: {
408
69
                        std::stringstream ss;
409
69
                        ss << "couldn't lex number, junk after 'E': " << *c;
410
69
                        throw StaticError(filename, begin, ss.str());
411
5.82k
                    }
412
9.94k
                }
413
9.88k
                break;
414
415
9.88k
            case AFTER_EXP_SIGN:
416
3.57k
                switch (*c) {
417
991
                    case '0':
418
1.42k
                    case '1':
419
1.74k
                    case '2':
420
2.93k
                    case '3':
421
3.27k
                    case '4':
422
3.28k
                    case '5':
423
3.33k
                    case '6':
424
3.37k
                    case '7':
425
3.47k
                    case '8':
426
3.55k
                    case '9': state = AFTER_EXP_DIGIT; break;
427
428
16
                    default: {
429
16
                        std::stringstream ss;
430
16
                        ss << "couldn't lex number, junk after exponent sign: " << *c;
431
16
                        throw StaticError(filename, begin, ss.str());
432
3.47k
                    }
433
3.57k
                }
434
3.55k
                break;
435
436
209k
            case AFTER_EXP_DIGIT:
437
209k
                switch (*c) {
438
166k
                    case '0':
439
171k
                    case '1':
440
174k
                    case '2':
441
177k
                    case '3':
442
182k
                    case '4':
443
184k
                    case '5':
444
187k
                    case '6':
445
191k
                    case '7':
446
196k
                    case '8':
447
199k
                    case '9': state = AFTER_EXP_DIGIT; break;
448
449
587
                    case '_': state = AFTER_EXP_UNDERSCORE; goto skip_char;
450
451
9.85k
                    default: goto end;
452
209k
                }
453
199k
                break;
454
455
199k
            case AFTER_EXP_UNDERSCORE:
456
587
                switch (*c) {
457
                    // The only valid transition from _ is to a digit.
458
131
                    case '0':
459
472
                    case '1':
460
476
                    case '2':
461
476
                    case '3':
462
484
                    case '4':
463
488
                    case '5':
464
501
                    case '6':
465
506
                    case '7':
466
562
                    case '8':
467
575
                    case '9': state = AFTER_EXP_DIGIT; break;
468
469
12
                    default: {
470
12
                        std::stringstream ss;
471
12
                        ss << "couldn't lex number, junk after _: " << *c;
472
12
                        throw StaticError(filename, begin, ss.str());
473
562
                    }
474
587
                }
475
575
                break;
476
30.7M
        }
477
17.3M
        r += *c;
478
479
17.3M
skip_char:
480
17.3M
        c++;
481
17.3M
    }
482
13.4M
end:
483
13.4M
    return r;
484
13.4M
}
485
486
// Check that b starts with the same run of spaces/tabs that a starts with.  Returns the length
// of that run if b matches all of it, otherwise returns 0.  If a has no whitespace prefix then
// the result is also 0.
static int whitespace_check(const char *a, const char *b)
{
    int matched = 0;
    for (;; ++matched) {
        const char expected = a[matched];
        // The end of a's whitespace prefix: everything before it matched.
        if (expected != ' ' && expected != '\t')
            return matched;
        if (b[matched] != expected)
            return 0;
    }
}
498
499
230
/** Append a human-readable description of the spaces and tabs in ws to msg.
 *
 * Produces e.g. "2 spaces and 1 tab", "1 space", "3 tabs", or "no indentation" when ws
 * contains neither spaces nor tabs.  Characters other than ' ' and '\t' are not counted.
 */
static void describe_whitespace(std::stringstream& msg, const std::string& ws) {
    int spaces = 0;
    int tabs = 0;
    for (size_t k = 0; k < ws.size(); ++k) {
        switch (ws[k]) {
            case ' ': ++spaces; break;
            case '\t': ++tabs; break;
            default: break;
        }
    }

    if (spaces == 0 && tabs == 0) {
        msg << "no indentation";
        return;
    }
    if (spaces > 0)
        msg << spaces << (spaces == 1 ? " space" : " spaces");
    if (spaces > 0 && tabs > 0)
        msg << " and ";
    if (tabs > 0)
        msg << tabs << (tabs == 1 ? " tab" : " tabs");
}
519
520
Tokens jsonnet_lex(const std::string &filename, const char *input)
521
42.5k
{
522
42.5k
    unsigned long line_number = 1;
523
42.5k
    const char *line_start = input;
524
525
42.5k
    Tokens r;
526
527
42.5k
    const char *c = input;
528
529
42.5k
    Fodder fodder;
530
42.5k
    bool fresh_line = true;  // Are we tokenizing from the beginning of a new line?
531
532
304M
    while (*c != '\0') {
533
        // Used to ensure we have actually advanced the pointer by the end of the iteration.
534
304M
        const char *original_c = c;
535
536
304M
        Token::Kind kind;
537
304M
        std::string data;
538
304M
        std::string string_block_indent;
539
304M
        std::string string_block_term_indent;
540
541
304M
        unsigned new_lines, indent;
542
304M
        lex_ws(c, new_lines, indent, line_start, line_number);
543
544
        // If it's the end of the file, discard final whitespace.
545
304M
        if (*c == '\0')
546
21.9k
            break;
547
548
304M
        if (new_lines > 0) {
549
            // Otherwise store whitespace in fodder.
550
32.1M
            unsigned blanks = new_lines - 1;
551
32.1M
            fodder.emplace_back(FodderElement::LINE_END, blanks, indent, EMPTY);
552
32.1M
            fresh_line = true;
553
32.1M
        }
554
555
304M
        Location begin(line_number, c - line_start + 1);
556
557
304M
        switch (*c) {
558
            // The following operators should never be combined with subsequent symbols.
559
1.31M
            case '{':
560
1.31M
                kind = Token::BRACE_L;
561
1.31M
                c++;
562
1.31M
                break;
563
564
1.29M
            case '}':
565
1.29M
                kind = Token::BRACE_R;
566
1.29M
                c++;
567
1.29M
                break;
568
569
6.81M
            case '[':
570
6.81M
                kind = Token::BRACKET_L;
571
6.81M
                c++;
572
6.81M
                break;
573
574
6.77M
            case ']':
575
6.77M
                kind = Token::BRACKET_R;
576
6.77M
                c++;
577
6.77M
                break;
578
579
26.0M
            case ',':
580
26.0M
                kind = Token::COMMA;
581
26.0M
                c++;
582
26.0M
                break;
583
584
15.3M
            case '.':
585
15.3M
                kind = Token::DOT;
586
15.3M
                c++;
587
15.3M
                break;
588
589
23.1M
            case '(':
590
23.1M
                kind = Token::PAREN_L;
591
23.1M
                c++;
592
23.1M
                break;
593
594
23.1M
            case ')':
595
23.1M
                kind = Token::PAREN_R;
596
23.1M
                c++;
597
23.1M
                break;
598
599
6.06M
            case ';':
600
6.06M
                kind = Token::SEMICOLON;
601
6.06M
                c++;
602
6.06M
                break;
603
604
            // Numeric literals.
605
4.09M
            case '0':
606
8.02M
            case '1':
607
8.81M
            case '2':
608
9.46M
            case '3':
609
10.2M
            case '4':
610
10.4M
            case '5':
611
10.9M
            case '6':
612
11.2M
            case '7':
613
12.4M
            case '8':
614
13.4M
            case '9':
615
13.4M
                kind = Token::NUMBER;
616
13.4M
                data = lex_number(c, filename, begin);
617
13.4M
                break;
618
619
            // UString literals.
620
361k
            case '"': {
621
361k
                c++;
622
70.4M
                for (;; ++c) {
623
70.4M
                    if (*c == '\0') {
624
68
                        throw StaticError(filename, begin, "unterminated string");
625
68
                    }
626
70.4M
                    if (*c == '"') {
627
361k
                        break;
628
361k
                    }
629
70.0M
                    if (*c == '\\' && *(c + 1) != '\0') {
630
217k
                        data += *c;
631
217k
                        ++c;
632
217k
                    }
633
70.0M
                    if (*c == '\n') {
634
                        // Maintain line/column counters.
635
5.62M
                        line_number++;
636
5.62M
                        line_start = c + 1;
637
5.62M
                    }
638
70.0M
                    data += *c;
639
70.0M
                }
640
361k
                c++;  // Advance beyond the ".
641
361k
                kind = Token::STRING_DOUBLE;
642
361k
            } break;
643
644
            // UString literals.
645
10.0M
            case '\'': {
646
10.0M
                c++;
647
139M
                for (;; ++c) {
648
139M
                    if (*c == '\0') {
649
71
                        throw StaticError(filename, begin, "unterminated string");
650
71
                    }
651
139M
                    if (*c == '\'') {
652
10.0M
                        break;
653
10.0M
                    }
654
129M
                    if (*c == '\\' && *(c + 1) != '\0') {
655
1.11M
                        data += *c;
656
1.11M
                        ++c;
657
1.11M
                    }
658
129M
                    if (*c == '\n') {
659
                        // Maintain line/column counters.
660
4.46M
                        line_number++;
661
4.46M
                        line_start = c + 1;
662
4.46M
                    }
663
129M
                    data += *c;
664
129M
                }
665
10.0M
                c++;  // Advance beyond the '.
666
10.0M
                kind = Token::STRING_SINGLE;
667
10.0M
            } break;
668
669
            // Verbatim string literals.
670
            // ' and " quoting is interpreted here, unlike non-verbatim strings
671
            // where it is done later by jsonnet_string_unescape.  This is OK
672
            // in this case because no information is lost by resolving the
673
            // repeated quote into a single quote, so we can go back to the
674
            // original form in the formatter.
675
11.2k
            case '@': {
676
11.2k
                c++;
677
11.2k
                if (*c != '"' && *c != '\'') {
678
43
                    std::stringstream ss;
679
43
                    ss << "couldn't lex verbatim string, junk after '@': " << *c;
680
43
                    throw StaticError(filename, begin, ss.str());
681
43
                }
682
11.1k
                const char quot = *c;
683
11.1k
                c++;  // Advance beyond the opening quote.
684
210k
                for (;; ++c) {
685
210k
                    if (*c == '\0') {
686
74
                        throw StaticError(filename, begin, "unterminated verbatim string");
687
74
                    }
688
210k
                    if (*c == quot) {
689
14.0k
                        if (*(c + 1) == quot) {
690
2.95k
                            c++;
691
11.1k
                        } else {
692
11.1k
                            break;
693
11.1k
                        }
694
14.0k
                    }
695
199k
                    data += *c;
696
199k
                }
697
11.1k
                c++;  // Advance beyond the closing quote.
698
11.1k
                if (quot == '"') {
699
7.32k
                    kind = Token::VERBATIM_STRING_DOUBLE;
700
7.32k
                } else {
701
3.77k
                    kind = Token::VERBATIM_STRING_SINGLE;
702
3.77k
                }
703
11.1k
            } break;
704
705
            // Keywords
706
170M
            default:
707
170M
                if (is_identifier_first(*c)) {
708
128M
                    std::string id;
709
694M
                    for (; is_identifier(*c); ++c)
710
566M
                        id += *c;
711
128M
                    kind = lex_get_keyword_kind(id);
712
128M
                    data = id;
713
714
128M
                } else if (is_symbol(*c) || *c == '#') {
715
                    // Single line C++ and Python style comments.
716
41.8M
                    if (*c == '#' || (*c == '/' && *(c + 1) == '/')) {
717
8.23M
                        std::vector<std::string> comment(1);
718
8.23M
                        unsigned blanks;
719
8.23M
                        unsigned indent;
720
8.23M
                        lex_until_newline(c, comment[0], blanks, indent, line_start, line_number);
721
8.23M
                        auto kind = fresh_line ? FodderElement::PARAGRAPH : FodderElement::LINE_END;
722
8.23M
                        fodder.emplace_back(kind, blanks, indent, comment);
723
8.23M
                        fresh_line = true;
724
8.23M
                        continue;  // We've not got a token, just fodder, so keep scanning.
725
8.23M
                    }
726
727
                    // Multi-line C style comment.
728
33.6M
                    if (*c == '/' && *(c + 1) == '*') {
729
855k
                        unsigned margin = c - line_start;
730
731
855k
                        const char *initial_c = c;
732
855k
                        c += 2;  // Avoid matching /*/: skip the /* before starting the search for
733
                                 // */.
734
735
128M
                        while (!(*c == '*' && *(c + 1) == '/')) {
736
127M
                            if (*c == '\0') {
737
189
                                auto msg = "multi-line comment has no terminating */.";
738
189
                                throw StaticError(filename, begin, msg);
739
189
                            }
740
127M
                            if (*c == '\n') {
741
                                // Just keep track of the line / column counters.
742
19.2M
                                line_number++;
743
19.2M
                                line_start = c + 1;
744
19.2M
                            }
745
127M
                            ++c;
746
127M
                        }
747
855k
                        c += 2;  // Move the pointer to the char after the closing '/'.
748
749
855k
                        std::string comment(initial_c,
750
855k
                                            c - initial_c);  // Includes the "/*" and "*/".
751
752
                        // Lex whitespace after comment
753
855k
                        unsigned new_lines_after, indent_after;
754
855k
                        lex_ws(c, new_lines_after, indent_after, line_start, line_number);
755
855k
                        std::vector<std::string> lines;
756
855k
                        if (comment.find('\n') >= comment.length()) {
757
                            // Comment looks like /* foo */
758
535k
                            lines.push_back(comment);
759
535k
                            fodder.emplace_back(FodderElement::INTERSTITIAL, 0, 0, lines);
760
535k
                            if (new_lines_after > 0) {
761
521k
                                fodder.emplace_back(FodderElement::LINE_END,
762
521k
                                                    new_lines_after - 1,
763
521k
                                                    indent_after,
764
521k
                                                    EMPTY);
765
521k
                                fresh_line = true;
766
521k
                            }
767
535k
                        } else {
768
320k
                            lines = line_split(comment, margin);
769
320k
                            assert(lines[0][0] == '/');
770
                            // Little hack to support PARAGRAPHs with * down the LHS:
771
                            // Add a space to lines that start with a '*'
772
320k
                            bool all_star = true;
773
19.5M
                            for (auto &l : lines) {
774
19.5M
                                if (l[0] != '*')
775
19.3M
                                    all_star = false;
776
19.5M
                            }
777
320k
                            if (all_star) {
778
0
                                for (auto &l : lines) {
779
0
                                    if (l[0] == '*')
780
0
                                        l = " " + l;
781
0
                                }
782
0
                            }
783
320k
                            if (new_lines_after == 0) {
784
                                // Ensure a line end after the paragraph.
785
27.5k
                                new_lines_after = 1;
786
27.5k
                                indent_after = 0;
787
27.5k
                            }
788
320k
                            fodder_push_back(fodder,
789
320k
                                             FodderElement(FodderElement::PARAGRAPH,
790
320k
                                                           new_lines_after - 1,
791
320k
                                                           indent_after,
792
320k
                                                           lines));
793
320k
                            fresh_line = true;
794
320k
                        }
795
855k
                        continue;  // We've not got a token, just fodder, so keep scanning.
796
855k
                    }
797
798
                    // Text block
799
32.7M
                    if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|') {
800
16.2k
                        c += 3;  // Skip the "|||".
801
802
16.2k
                        bool chomp_trailing_nl = false;
803
16.2k
                        if (*c == '-') {
804
1.70k
                            chomp_trailing_nl = true;
805
1.70k
                            c++;
806
1.70k
                        }
807
808
20.5k
                        while (is_horz_ws(*c)) ++c;  // Chomp whitespace at end of line.
809
16.2k
                        if (*c != '\n') {
810
117
                            auto msg = "text block syntax requires new line after |||.";
811
117
                            throw StaticError(filename, begin, msg);
812
117
                        }
813
16.0k
                        std::stringstream block;
814
16.0k
                        c++;  // Skip the "\n"
815
16.0k
                        line_number++;
816
                        // Skip any blank lines at the beginning of the block.
817
19.9k
                        while (*c == '\n') {
818
3.84k
                            line_number++;
819
3.84k
                            ++c;
820
3.84k
                            block << '\n';
821
3.84k
                        }
822
16.0k
                        line_start = c;
823
16.0k
                        const char *first_line = c;
824
16.0k
                        int ws_chars = whitespace_check(first_line, c);
825
16.0k
                        string_block_indent = std::string(first_line, ws_chars);
826
16.0k
                        if (ws_chars == 0) {
827
64
                            auto msg = "text block's first line must start with whitespace.";
828
64
                            throw StaticError(filename, begin, msg);
829
64
                        }
830
21.8k
                        while (true) {
831
21.8k
                            assert(ws_chars > 0);
832
                            // Read up to the \n
833
9.70M
                            for (c = &c[ws_chars]; *c != '\n'; ++c) {
834
9.68M
                                if (*c == '\0')
835
184
                                    throw StaticError(filename, begin, "unexpected EOF");
836
9.68M
                                block << *c;
837
9.68M
                            }
838
                            // Add the \n
839
21.6k
                            block << '\n';
840
21.6k
                            ++c;
841
21.6k
                            line_number++;
842
21.6k
                            line_start = c;
843
                            // Skip any blank lines
844
25.1k
                            while (*c == '\n') {
845
3.43k
                                line_number++;
846
3.43k
                                ++c;
847
3.43k
                                block << '\n';
848
3.43k
                            }
849
                            // Examine next line
850
21.6k
                            ws_chars = whitespace_check(first_line, c);
851
21.6k
                            if (ws_chars == 0) {
852
                                // End of text block (or indentation error).
853
                                // Count actual whitespace on this line.
854
15.8k
                                int actual_ws = 0;
855
215k
                                while (c[actual_ws] == ' ' ||
856
199k
                                       c[actual_ws] == '\t') {
857
199k
                                    actual_ws++;
858
199k
                                }
859
860
                                // Check if this is the terminator |||
861
15.8k
                                bool is_terminator = (
862
15.8k
                                    c[actual_ws] == '|' &&
863
15.6k
                                    c[actual_ws + 1] == '|' &&
864
15.6k
                                    c[actual_ws + 2] == '|');
865
866
15.8k
                                if (!is_terminator) {
867
                                    // Not a terminator - check if it's an
868
                                    // indentation issue.
869
244
                                    if (actual_ws > 0) {
870
                                        // Has whitespace but doesn't match expected
871
                                        // indentation.
872
115
                                        std::stringstream msg;
873
115
                                        msg << "text block indentation mismatch: "
874
115
                                                "expected at least ";
875
115
                                        describe_whitespace(msg, string_block_indent);
876
115
                                        msg << ", found ";
877
115
                                        describe_whitespace(msg, std::string(c, actual_ws));
878
115
                                        throw StaticError(filename, begin, msg.str());
879
129
                                    } else {
880
                                        // No whitespace and no ||| - missing
881
                                        // terminator.
882
129
                                        auto msg =
883
129
                                            "text block not terminated with |||";
884
129
                                        throw StaticError(filename, begin, msg);
885
129
                                    }
886
244
                                }
887
888
                                // Valid termination - skip over any whitespace.
889
107k
                                while (*c == ' ' || *c == '\t') {
890
92.0k
                                    string_block_term_indent += *c;
891
92.0k
                                    ++c;
892
92.0k
                                }
893
                                // Skip the |||
894
15.6k
                                c += 3;  // Leave after the last |
895
15.6k
                                data = block.str();
896
15.6k
                                kind = Token::STRING_BLOCK;
897
15.6k
                                if (chomp_trailing_nl) {
898
1.66k
                                    assert(data.back() == '\n');
899
1.66k
                                    data.pop_back();
900
1.66k
                                }
901
15.6k
                                break;  // Out of the while loop.
902
15.6k
                            }
903
21.6k
                        }
904
905
15.6k
                        break;  // Out of the switch.
906
16.0k
                    }
907
908
32.7M
                    const char *operator_begin = c;
909
103M
                    for (; is_symbol(*c); ++c) {
910
                        // Not allowed // in operators
911
71.1M
                        if (*c == '/' && *(c + 1) == '/')
912
1.30k
                            break;
913
                        // Not allowed /* in operators
914
71.1M
                        if (*c == '/' && *(c + 1) == '*')
915
1.45k
                            break;
916
                        // Not allowed ||| in operators
917
71.1M
                        if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|')
918
2.26k
                            break;
919
71.1M
                    }
920
                    // Not allowed to end with a + - ~ ! unless a single char.
921
                    // So, wind it back if we need to (but not too far).
922
38.4M
                    while (c > operator_begin + 1 && !allowed_at_end_of_operator(*(c - 1))) {
923
5.75M
                        c--;
924
5.75M
                    }
925
32.7M
                    data += std::string(operator_begin, c);
926
32.7M
                    if (data == "$") {
927
114k
                        kind = Token::DOLLAR;
928
114k
                        data = "";
929
32.6M
                    } else {
930
32.6M
                        kind = Token::OPERATOR;
931
32.6M
                    }
932
32.7M
                } else {
933
321
                    std::stringstream ss;
934
321
                    ss << "Could not lex the character ";
935
321
                    auto uc = (unsigned char)(*c);
936
321
                    if (*c < 32)
937
289
                        ss << "code " << unsigned(uc);
938
32
                    else
939
32
                        ss << "'" << *c << "'";
940
321
                    throw StaticError(filename, begin, ss.str());
941
321
                }
942
304M
        }
943
944
        // Ensure that a bug in the above code does not cause an infinite memory consuming loop due
945
        // to pushing empty tokens.
946
294M
        if (c == original_c) {
947
0
            throw StaticError(filename, begin, "internal lexing error:  pointer did not advance");
948
0
        }
949
950
294M
        Location end(line_number, (c + 1) - line_start);
951
294M
        r.emplace_back(kind,
952
294M
                       fodder,
953
294M
                       data,
954
294M
                       string_block_indent,
955
294M
                       string_block_term_indent,
956
294M
                       LocationRange(filename, begin, end));
957
294M
        fodder.clear();
958
294M
        fresh_line = false;
959
294M
    }
960
961
41.0k
    Location begin(line_number, c - line_start + 1);
962
41.0k
    Location end(line_number, (c + 1) - line_start + 1);
963
41.0k
    r.emplace_back(Token::END_OF_FILE, fodder, "", "", "", LocationRange(filename, begin, end));
964
41.0k
    return r;
965
42.5k
}
966
967
std::string jsonnet_unlex(const Tokens &tokens)
968
0
{
969
0
    std::stringstream ss;
970
0
    for (const auto &t : tokens) {
971
0
        for (const auto &f : t.fodder) {
972
0
            switch (f.kind) {
973
0
                case FodderElement::LINE_END: {
974
0
                    if (f.comment.size() > 0) {
975
0
                        ss << "LineEnd(" << f.blanks << ", " << f.indent << ", " << f.comment[0]
976
0
                           << ")\n";
977
0
                    } else {
978
0
                        ss << "LineEnd(" << f.blanks << ", " << f.indent << ")\n";
979
0
                    }
980
0
                } break;
981
982
0
                case FodderElement::INTERSTITIAL: {
983
0
                    ss << "Interstitial(" << f.comment[0] << ")\n";
984
0
                } break;
985
986
0
                case FodderElement::PARAGRAPH: {
987
0
                    ss << "Paragraph(\n";
988
0
                    for (const auto &line : f.comment) {
989
0
                        ss << "    " << line << '\n';
990
0
                    }
991
0
                    ss << ")" << f.blanks << "\n";
992
0
                } break;
993
0
            }
994
0
        }
995
0
        if (t.kind == Token::END_OF_FILE) {
996
0
            ss << "EOF\n";
997
0
            break;
998
0
        }
999
0
        if (t.kind == Token::STRING_DOUBLE) {
1000
0
            ss << "\"" << t.data << "\"\n";
1001
0
        } else if (t.kind == Token::STRING_SINGLE) {
1002
0
            ss << "'" << t.data << "'\n";
1003
0
        } else if (t.kind == Token::STRING_BLOCK) {
1004
0
            ss << "|||\n";
1005
0
            ss << t.stringBlockIndent;
1006
0
            for (const char *cp = t.data.c_str(); *cp != '\0'; ++cp) {
1007
0
                ss << *cp;
1008
0
                if (*cp == '\n' && *(cp + 1) != '\n' && *(cp + 1) != '\0') {
1009
0
                    ss << t.stringBlockIndent;
1010
0
                }
1011
0
            }
1012
0
            ss << t.stringBlockTermIndent << "|||\n";
1013
0
        } else {
1014
0
            ss << t.data << "\n";
1015
0
        }
1016
0
    }
1017
0
    return ss.str();
1018
0
}
1019
1020
}  // namespace jsonnet::internal