Coverage Report

Created: 2025-11-11 06:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/jsonnet/core/lexer.cpp
Line
Count
Source
1
/*
2
Copyright 2015 Google Inc. All rights reserved.
3
4
Licensed under the Apache License, Version 2.0 (the "License");
5
you may not use this file except in compliance with the License.
6
You may obtain a copy of the License at
7
8
    http://www.apache.org/licenses/LICENSE-2.0
9
10
Unless required by applicable law or agreed to in writing, software
11
distributed under the License is distributed on an "AS IS" BASIS,
12
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
See the License for the specific language governing permissions and
14
limitations under the License.
15
*/
16
17
#include <cassert>
18
19
#include <map>
20
#include <sstream>
21
#include <string>
22
23
#include "lexer.h"
24
#include "static_error.h"
25
#include "unicode.h"
26
27
namespace jsonnet::internal {
28
29
// Shared empty list of comment lines, passed when building fodder elements that carry no
// comment text (the LINE_END elements created in jsonnet_lex).
static const std::vector<std::string> EMPTY;
30
31
/** Return true for horizontal whitespace: a space, a tab or a carriage return (never '\n'). */
static bool is_horz_ws(char c)
{
    switch (c) {
        case ' ':
        case '\t':
        case '\r': return true;
        default: return false;
    }
}
36
37
/** Is the char whitespace. */
38
static bool is_ws(char c)
39
828M
{
40
828M
    return c == '\n' || is_horz_ws(c);
41
828M
}
42
43
/** Strip whitespace from both ends of a string, but only up to margin on the left hand side. */
44
static std::string strip_ws(const std::string &s, unsigned margin)
45
19.7M
{
46
19.7M
    if (s.size() == 0)
47
15.3M
        return s;  // Avoid underflow below.
48
4.45M
    size_t i = 0;
49
10.2M
    while (i < s.length() && is_horz_ws(s[i]) && i < margin)
50
5.82M
        i++;
51
4.45M
    size_t j = s.size();
52
8.15M
    while (j > i && is_horz_ws(s[j - 1])) {
53
3.69M
        j--;
54
3.69M
    }
55
4.45M
    return std::string(&s[i], &s[j]);
56
19.7M
}
57
58
/** Split a string by \n and also strip left (up to margin) & right whitespace from each line. */
59
static std::vector<std::string> line_split(const std::string &s, unsigned margin)
60
428k
{
61
428k
    std::vector<std::string> ret;
62
428k
    std::stringstream ss;
63
132M
    for (size_t i = 0; i < s.length(); ++i) {
64
132M
        if (s[i] == '\n') {
65
19.3M
            ret.emplace_back(strip_ws(ss.str(), margin));
66
19.3M
            ss.str("");
67
113M
        } else {
68
113M
            ss << s[i];
69
113M
        }
70
132M
    }
71
428k
    ret.emplace_back(strip_ws(ss.str(), margin));
72
428k
    return ret;
73
428k
}
74
75
/** Consume whitespace.
76
 *
77
 * Return number of \n and number of spaces after last \n.  Convert \t to spaces.
78
 */
79
static void lex_ws(const char *&c, unsigned &new_lines, unsigned &indent, const char *&line_start,
80
                   unsigned long &line_number)
81
365M
{
82
365M
    indent = 0;
83
365M
    new_lines = 0;
84
828M
    for (; *c != '\0' && is_ws(*c); c++) {
85
463M
        switch (*c) {
86
957k
            case '\r':
87
                // Ignore.
88
957k
                break;
89
90
54.3M
            case '\n':
91
54.3M
                indent = 0;
92
54.3M
                new_lines++;
93
54.3M
                line_number++;
94
54.3M
                line_start = c + 1;
95
54.3M
                break;
96
97
408M
            case ' ': indent += 1; break;
98
99
            // This only works for \t at the beginning of lines, but we strip it everywhere else
100
            // anyway.  The only case where this will cause a problem is spaces followed by \t
101
            // at the beginning of a line.  However that is rare, ill-advised, and if re-indentation
102
            // is enabled it will be fixed later.
103
60.4k
            case '\t': indent += 8; break;
104
463M
        }
105
463M
    }
106
365M
}
107
108
/**
109
# Consume all text until the end of the line, return number of newlines after that and indent
110
*/
111
static void lex_until_newline(const char *&c, std::string &text, unsigned &blanks, unsigned &indent,
112
                              const char *&line_start, unsigned long &line_number)
113
11.4M
{
114
11.4M
    const char *original_c = c;
115
11.4M
    const char *last_non_space = c;
116
139M
    for (; *c != '\0' && *c != '\n'; c++) {
117
128M
        if (!is_horz_ws(*c))
118
110M
            last_non_space = c;
119
128M
    }
120
11.4M
    text = std::string(original_c, last_non_space - original_c + 1);
121
    // Consume subsequent whitespace including the '\n'.
122
11.4M
    unsigned new_lines;
123
11.4M
    lex_ws(c, new_lines, indent, line_start, line_number);
124
11.4M
    blanks = new_lines == 0 ? 0 : new_lines - 1;
125
11.4M
}
126
127
/** Return true if c lies in the range 'A'..'Z'. */
static bool is_upper(char c)
{
    if (c < 'A')
        return false;
    return c <= 'Z';
}
131
132
/** Return true if c lies in the range 'a'..'z'. */
static bool is_lower(char c)
{
    if (c < 'a')
        return false;
    return c <= 'z';
}
136
137
/** Return true if c is a decimal digit, i.e. lies in the range '0'..'9'. */
static bool is_number(char c)
{
    if (c < '0')
        return false;
    return c <= '9';
}
141
142
static bool is_identifier_first(char c)
143
984M
{
144
984M
    return is_upper(c) || is_lower(c) || c == '_';
145
984M
}
146
147
static bool is_identifier(char c)
148
790M
{
149
790M
    return is_identifier_first(c) || is_number(c);
150
790M
}
151
152
/** Return true if c is one of the characters from which operator tokens are built. */
static bool is_symbol(char c)
{
    switch (c) {
        case '!': case '$': case ':': case '~':
        case '+': case '-': case '*': case '/': case '%':
        case '&': case '|': case '^':
        case '=': case '<': case '>':
            return true;
        default:
            return false;
    }
}
173
174
22.2M
/** Return whether c may be the last character of a multi-character operator.
 *
 * The characters '+', '-', '~', '!' and '$' may not end such an operator; jsonnet_lex uses this
 * to wind an operator token back until it ends in a permitted character.
 */
bool allowed_at_end_of_operator(char c)
{
    const bool forbidden = c == '+' || c == '-' || c == '~' || c == '!' || c == '$';
    return !forbidden;
}
184
185
/** Table of reserved words and the token kind each one lexes to.  lex_get_keyword_kind looks
 * identifiers up here and falls back to Token::IDENTIFIER for anything not listed. */
static const std::map<std::string, Token::Kind> keywords = {
186
    {"assert", Token::ASSERT},
187
    {"else", Token::ELSE},
188
    {"error", Token::ERROR},
189
    {"false", Token::FALSE},
190
    {"for", Token::FOR},
191
    {"function", Token::FUNCTION},
192
    {"if", Token::IF},
193
    {"import", Token::IMPORT},
194
    {"importstr", Token::IMPORTSTR},
195
    {"importbin", Token::IMPORTBIN},
196
    {"in", Token::IN},
197
    {"local", Token::LOCAL},
198
    {"null", Token::NULL_LIT},
199
    {"self", Token::SELF},
200
    {"super", Token::SUPER},
201
    {"tailstrict", Token::TAILSTRICT},
202
    {"then", Token::THEN},
203
    {"true", Token::TRUE},
204
};
205
206
/** Classify an identifier: the matching keyword's token kind if it is a reserved word,
 * Token::IDENTIFIER otherwise. */
Token::Kind lex_get_keyword_kind(const std::string &identifier)
{
    const auto found = keywords.find(identifier);
    return found != keywords.end() ? found->second : Token::IDENTIFIER;
}
213
214
std::string lex_number(const char *&c, const std::string &filename, const Location &begin)
215
21.4M
{
216
    // This function should be understood with reference to the linked image:
217
    // https://www.json.org/img/number.png
218
219
    // Note, we deviate from the json.org documentation as follows:
220
    // There is no reason to lex negative numbers as atomic tokens, it is better to parse them
221
    // as a unary operator combined with a numeric literal.  This avoids x-1 being tokenized as
222
    // <identifier> <number> instead of the intended <identifier> <binop> <number>.
223
224
21.4M
    enum State {
225
21.4M
        BEGIN,
226
21.4M
        AFTER_ZERO,
227
21.4M
        AFTER_ONE_TO_NINE,
228
21.4M
        AFTER_DOT,
229
21.4M
        AFTER_DIGIT,
230
21.4M
        AFTER_E,
231
21.4M
        AFTER_EXP_SIGN,
232
21.4M
        AFTER_EXP_DIGIT
233
21.4M
    } state;
234
235
21.4M
    std::string r;
236
237
21.4M
    state = BEGIN;
238
48.5M
    while (true) {
239
48.5M
        switch (state) {
240
21.4M
            case BEGIN:
241
21.4M
                switch (*c) {
242
10.3M
                    case '0': state = AFTER_ZERO; break;
243
244
4.47M
                    case '1':
245
5.35M
                    case '2':
246
6.16M
                    case '3':
247
7.23M
                    case '4':
248
7.44M
                    case '5':
249
8.28M
                    case '6':
250
8.67M
                    case '7':
251
10.1M
                    case '8':
252
11.1M
                    case '9': state = AFTER_ONE_TO_NINE; break;
253
254
0
                    default: throw StaticError(filename, begin, "couldn't lex number");
255
21.4M
                }
256
21.4M
                break;
257
258
21.4M
            case AFTER_ZERO:
259
10.3M
                switch (*c) {
260
53.1k
                    case '.': state = AFTER_DOT; break;
261
262
782
                    case 'e':
263
4.63k
                    case 'E': state = AFTER_E; break;
264
265
10.2M
                    default: goto end;
266
10.3M
                }
267
57.7k
                break;
268
269
14.1M
            case AFTER_ONE_TO_NINE:
270
14.1M
                switch (*c) {
271
36.7k
                    case '.': state = AFTER_DOT; break;
272
273
2.90k
                    case 'e':
274
275k
                    case 'E': state = AFTER_E; break;
275
276
1.08M
                    case '0':
277
1.12M
                    case '1':
278
1.41M
                    case '2':
279
1.51M
                    case '3':
280
1.62M
                    case '4':
281
1.87M
                    case '5':
282
2.02M
                    case '6':
283
2.10M
                    case '7':
284
2.18M
                    case '8':
285
2.99M
                    case '9': state = AFTER_ONE_TO_NINE; break;
286
287
10.8M
                    default: goto end;
288
14.1M
                }
289
3.30M
                break;
290
291
3.30M
            case AFTER_DOT:
292
89.8k
                switch (*c) {
293
3.73k
                    case '0':
294
30.3k
                    case '1':
295
31.8k
                    case '2':
296
33.6k
                    case '3':
297
35.2k
                    case '4':
298
84.5k
                    case '5':
299
85.5k
                    case '6':
300
86.2k
                    case '7':
301
88.5k
                    case '8':
302
89.8k
                    case '9': state = AFTER_DIGIT; break;
303
304
26
                    default: {
305
26
                        std::stringstream ss;
306
26
                        ss << "couldn't lex number, junk after decimal point: " << *c;
307
26
                        throw StaticError(filename, begin, ss.str());
308
88.5k
                    }
309
89.8k
                }
310
89.8k
                break;
311
312
1.66M
            case AFTER_DIGIT:
313
1.66M
                switch (*c) {
314
1.95k
                    case 'e':
315
3.08k
                    case 'E': state = AFTER_E; break;
316
317
1.10M
                    case '0':
318
1.18M
                    case '1':
319
1.20M
                    case '2':
320
1.26M
                    case '3':
321
1.29M
                    case '4':
322
1.37M
                    case '5':
323
1.43M
                    case '6':
324
1.46M
                    case '7':
325
1.50M
                    case '8':
326
1.57M
                    case '9': state = AFTER_DIGIT; break;
327
328
86.7k
                    default: goto end;
329
1.66M
                }
330
1.58M
                break;
331
332
1.58M
            case AFTER_E:
333
283k
                switch (*c) {
334
1.96k
                    case '+':
335
3.46k
                    case '-': state = AFTER_EXP_SIGN; break;
336
337
3.09k
                    case '0':
338
5.78k
                    case '1':
339
275k
                    case '2':
340
277k
                    case '3':
341
277k
                    case '4':
342
278k
                    case '5':
343
278k
                    case '6':
344
278k
                    case '7':
345
279k
                    case '8':
346
279k
                    case '9': state = AFTER_EXP_DIGIT; break;
347
348
86
                    default: {
349
86
                        std::stringstream ss;
350
86
                        ss << "couldn't lex number, junk after 'E': " << *c;
351
86
                        throw StaticError(filename, begin, ss.str());
352
279k
                    }
353
283k
                }
354
283k
                break;
355
356
283k
            case AFTER_EXP_SIGN:
357
3.46k
                switch (*c) {
358
596
                    case '0':
359
1.05k
                    case '1':
360
1.15k
                    case '2':
361
2.96k
                    case '3':
362
3.32k
                    case '4':
363
3.33k
                    case '5':
364
3.33k
                    case '6':
365
3.34k
                    case '7':
366
3.35k
                    case '8':
367
3.44k
                    case '9': state = AFTER_EXP_DIGIT; break;
368
369
18
                    default: {
370
18
                        std::stringstream ss;
371
18
                        ss << "couldn't lex number, junk after exponent sign: " << *c;
372
18
                        throw StaticError(filename, begin, ss.str());
373
3.35k
                    }
374
3.46k
                }
375
3.44k
                break;
376
377
568k
            case AFTER_EXP_DIGIT:
378
568k
                switch (*c) {
379
4.96k
                    case '0':
380
7.33k
                    case '1':
381
8.75k
                    case '2':
382
276k
                    case '3':
383
278k
                    case '4':
384
279k
                    case '5':
385
279k
                    case '6':
386
282k
                    case '7':
387
283k
                    case '8':
388
285k
                    case '9': state = AFTER_EXP_DIGIT; break;
389
390
283k
                    default: goto end;
391
568k
                }
392
285k
                break;
393
48.5M
        }
394
27.0M
        r += *c;
395
27.0M
        c++;
396
27.0M
    }
397
21.4M
end:
398
21.4M
    return r;
399
21.4M
}
400
401
// Measure the leading run of spaces and tabs in a, requiring b to begin with exactly the same
// characters.  Returns the length of that run, or 0 if b diverges from it.  If a has no
// whitespace prefix then the result is 0 as well.
static int whitespace_check(const char *a, const char *b)
{
    int n = 0;
    for (; a[n] == ' ' || a[n] == '\t'; ++n) {
        if (a[n] != b[n])
            return 0;
    }
    return n;
}
413
414
/*
415
static void add_whitespace(Fodder &fodder, const char *s, size_t n)
416
{
417
    std::string ws(s, n);
418
    if (fodder.size() == 0 || fodder.back().kind != FodderElement::WHITESPACE) {
419
        fodder.emplace_back(FodderElement::WHITESPACE, ws);
420
    } else {
421
        fodder.back().data += ws;
422
    }
423
}
424
*/
425
426
Tokens jsonnet_lex(const std::string &filename, const char *input)
427
47.7k
{
428
47.7k
    unsigned long line_number = 1;
429
47.7k
    const char *line_start = input;
430
431
47.7k
    Tokens r;
432
433
47.7k
    const char *c = input;
434
435
47.7k
    Fodder fodder;
436
47.7k
    bool fresh_line = true;  // Are we tokenizing from the beginning of a new line?
437
438
352M
    while (*c != '\0') {
439
        // Used to ensure we have actually advanced the pointer by the end of the iteration.
440
352M
        const char *original_c = c;
441
442
352M
        Token::Kind kind;
443
352M
        std::string data;
444
352M
        std::string string_block_indent;
445
352M
        std::string string_block_term_indent;
446
447
352M
        unsigned new_lines, indent;
448
352M
        lex_ws(c, new_lines, indent, line_start, line_number);
449
450
        // If it's the end of the file, discard final whitespace.
451
352M
        if (*c == '\0')
452
24.6k
            break;
453
454
352M
        if (new_lines > 0) {
455
            // Otherwise store whitespace in fodder.
456
35.9M
            unsigned blanks = new_lines - 1;
457
35.9M
            fodder.emplace_back(FodderElement::LINE_END, blanks, indent, EMPTY);
458
35.9M
            fresh_line = true;
459
35.9M
        }
460
461
352M
        Location begin(line_number, c - line_start + 1);
462
463
352M
        switch (*c) {
464
            // The following operators should never be combined with subsequent symbols.
465
1.52M
            case '{':
466
1.52M
                kind = Token::BRACE_L;
467
1.52M
                c++;
468
1.52M
                break;
469
470
1.50M
            case '}':
471
1.50M
                kind = Token::BRACE_R;
472
1.50M
                c++;
473
1.50M
                break;
474
475
7.66M
            case '[':
476
7.66M
                kind = Token::BRACKET_L;
477
7.66M
                c++;
478
7.66M
                break;
479
480
7.63M
            case ']':
481
7.63M
                kind = Token::BRACKET_R;
482
7.63M
                c++;
483
7.63M
                break;
484
485
30.9M
            case ',':
486
30.9M
                kind = Token::COMMA;
487
30.9M
                c++;
488
30.9M
                break;
489
490
17.2M
            case '.':
491
17.2M
                kind = Token::DOT;
492
17.2M
                c++;
493
17.2M
                break;
494
495
26.0M
            case '(':
496
26.0M
                kind = Token::PAREN_L;
497
26.0M
                c++;
498
26.0M
                break;
499
500
25.9M
            case ')':
501
25.9M
                kind = Token::PAREN_R;
502
25.9M
                c++;
503
25.9M
                break;
504
505
6.76M
            case ';':
506
6.76M
                kind = Token::SEMICOLON;
507
6.76M
                c++;
508
6.76M
                break;
509
510
            // Numeric literals.
511
10.3M
            case '0':
512
14.7M
            case '1':
513
15.6M
            case '2':
514
16.4M
            case '3':
515
17.5M
            case '4':
516
17.7M
            case '5':
517
18.5M
            case '6':
518
18.9M
            case '7':
519
20.4M
            case '8':
520
21.4M
            case '9':
521
21.4M
                kind = Token::NUMBER;
522
21.4M
                data = lex_number(c, filename, begin);
523
21.4M
                break;
524
525
            // UString literals.
526
416k
            case '"': {
527
416k
                c++;
528
87.0M
                for (;; ++c) {
529
87.0M
                    if (*c == '\0') {
530
95
                        throw StaticError(filename, begin, "unterminated string");
531
95
                    }
532
87.0M
                    if (*c == '"') {
533
416k
                        break;
534
416k
                    }
535
86.6M
                    if (*c == '\\' && *(c + 1) != '\0') {
536
241k
                        data += *c;
537
241k
                        ++c;
538
241k
                    }
539
86.6M
                    if (*c == '\n') {
540
                        // Maintain line/column counters.
541
7.12M
                        line_number++;
542
7.12M
                        line_start = c + 1;
543
7.12M
                    }
544
86.6M
                    data += *c;
545
86.6M
                }
546
416k
                c++;  // Advance beyond the ".
547
416k
                kind = Token::STRING_DOUBLE;
548
416k
            } break;
549
550
            // UString literals.
551
11.3M
            case '\'': {
552
11.3M
                c++;
553
149M
                for (;; ++c) {
554
149M
                    if (*c == '\0') {
555
82
                        throw StaticError(filename, begin, "unterminated string");
556
82
                    }
557
149M
                    if (*c == '\'') {
558
11.3M
                        break;
559
11.3M
                    }
560
137M
                    if (*c == '\\' && *(c + 1) != '\0') {
561
1.24M
                        data += *c;
562
1.24M
                        ++c;
563
1.24M
                    }
564
137M
                    if (*c == '\n') {
565
                        // Maintain line/column counters.
566
3.64M
                        line_number++;
567
3.64M
                        line_start = c + 1;
568
3.64M
                    }
569
137M
                    data += *c;
570
137M
                }
571
11.3M
                c++;  // Advance beyond the '.
572
11.3M
                kind = Token::STRING_SINGLE;
573
11.3M
            } break;
574
575
            // Verbatim string literals.
576
            // ' and " quoting is interpreted here, unlike non-verbatim strings
577
            // where it is done later by jsonnet_string_unescape.  This is OK
578
            // in this case because no information is lost by resolving the
579
            // repeated quote into a single quote, so we can go back to the
580
            // original form in the formatter.
581
10.4k
            case '@': {
582
10.4k
                c++;
583
10.4k
                if (*c != '"' && *c != '\'') {
584
36
                    std::stringstream ss;
585
36
                    ss << "couldn't lex verbatim string, junk after '@': " << *c;
586
36
                    throw StaticError(filename, begin, ss.str());
587
36
                }
588
10.3k
                const char quot = *c;
589
10.3k
                c++;  // Advance beyond the opening quote.
590
415k
                for (;; ++c) {
591
415k
                    if (*c == '\0') {
592
74
                        throw StaticError(filename, begin, "unterminated verbatim string");
593
74
                    }
594
414k
                    if (*c == quot) {
595
13.1k
                        if (*(c + 1) == quot) {
596
2.81k
                            c++;
597
10.3k
                        } else {
598
10.3k
                            break;
599
10.3k
                        }
600
13.1k
                    }
601
404k
                    data += *c;
602
404k
                }
603
10.3k
                c++;  // Advance beyond the closing quote.
604
10.3k
                if (quot == '"') {
605
6.75k
                    kind = Token::VERBATIM_STRING_DOUBLE;
606
6.75k
                } else {
607
3.56k
                    kind = Token::VERBATIM_STRING_SINGLE;
608
3.56k
                }
609
10.3k
            } break;
610
611
            // Keywords
612
193M
            default:
613
193M
                if (is_identifier_first(*c)) {
614
143M
                    std::string id;
615
790M
                    for (; is_identifier(*c); ++c)
616
646M
                        id += *c;
617
143M
                    kind = lex_get_keyword_kind(id);
618
143M
                    data = id;
619
620
143M
                } else if (is_symbol(*c) || *c == '#') {
621
                    // Single line C++ and Python style comments.
622
49.9M
                    if (*c == '#' || (*c == '/' && *(c + 1) == '/')) {
623
11.4M
                        std::vector<std::string> comment(1);
624
11.4M
                        unsigned blanks;
625
11.4M
                        unsigned indent;
626
11.4M
                        lex_until_newline(c, comment[0], blanks, indent, line_start, line_number);
627
11.4M
                        auto kind = fresh_line ? FodderElement::PARAGRAPH : FodderElement::LINE_END;
628
11.4M
                        fodder.emplace_back(kind, blanks, indent, comment);
629
11.4M
                        fresh_line = true;
630
11.4M
                        continue;  // We've not got a token, just fodder, so keep scanning.
631
11.4M
                    }
632
633
                    // Multi-line C style comment.
634
38.5M
                    if (*c == '/' && *(c + 1) == '*') {
635
1.51M
                        unsigned margin = c - line_start;
636
637
1.51M
                        const char *initial_c = c;
638
1.51M
                        c += 2;  // Avoid matching /*/: skip the /* before starting the search for
639
                                 // */.
640
641
138M
                        while (!(*c == '*' && *(c + 1) == '/')) {
642
136M
                            if (*c == '\0') {
643
218
                                auto msg = "multi-line comment has no terminating */.";
644
218
                                throw StaticError(filename, begin, msg);
645
218
                            }
646
136M
                            if (*c == '\n') {
647
                                // Just keep track of the line / column counters.
648
19.3M
                                line_number++;
649
19.3M
                                line_start = c + 1;
650
19.3M
                            }
651
136M
                            ++c;
652
136M
                        }
653
1.51M
                        c += 2;  // Move the pointer to the char after the closing '/'.
654
655
1.51M
                        std::string comment(initial_c,
656
1.51M
                                            c - initial_c);  // Includes the "/*" and "*/".
657
658
                        // Lex whitespace after comment
659
1.51M
                        unsigned new_lines_after, indent_after;
660
1.51M
                        lex_ws(c, new_lines_after, indent_after, line_start, line_number);
661
1.51M
                        std::vector<std::string> lines;
662
1.51M
                        if (comment.find('\n') >= comment.length()) {
663
                            // Comment looks like /* foo */
664
1.08M
                            lines.push_back(comment);
665
1.08M
                            fodder.emplace_back(FodderElement::INTERSTITIAL, 0, 0, lines);
666
1.08M
                            if (new_lines_after > 0) {
667
937k
                                fodder.emplace_back(FodderElement::LINE_END,
668
937k
                                                    new_lines_after - 1,
669
937k
                                                    indent_after,
670
937k
                                                    EMPTY);
671
937k
                                fresh_line = true;
672
937k
                            }
673
1.08M
                        } else {
674
428k
                            lines = line_split(comment, margin);
675
428k
                            assert(lines[0][0] == '/');
676
                            // Little hack to support PARAGRAPHs with * down the LHS:
677
                            // Add a space to lines that start with a '*'
678
428k
                            bool all_star = true;
679
19.7M
                            for (auto &l : lines) {
680
19.7M
                                if (l[0] != '*')
681
19.5M
                                    all_star = false;
682
19.7M
                            }
683
428k
                            if (all_star) {
684
0
                                for (auto &l : lines) {
685
0
                                    if (l[0] == '*')
686
0
                                        l = " " + l;
687
0
                                }
688
0
                            }
689
428k
                            if (new_lines_after == 0) {
690
                                // Ensure a line end after the paragraph.
691
27.3k
                                new_lines_after = 1;
692
27.3k
                                indent_after = 0;
693
27.3k
                            }
694
428k
                            fodder_push_back(fodder,
695
428k
                                             FodderElement(FodderElement::PARAGRAPH,
696
428k
                                                           new_lines_after - 1,
697
428k
                                                           indent_after,
698
428k
                                                           lines));
699
428k
                            fresh_line = true;
700
428k
                        }
701
1.51M
                        continue;  // We've not got a token, just fodder, so keep scanning.
702
1.51M
                    }
703
704
                    // Text block
705
37.0M
                    if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|') {
706
17.6k
                        c += 3;  // Skip the "|||".
707
708
17.6k
                        bool chomp_trailing_nl = false;
709
17.6k
                        if (*c == '-') {
710
1.03k
                            chomp_trailing_nl = true;
711
1.03k
                            c++;
712
1.03k
                        }
713
714
21.1k
                        while (is_horz_ws(*c)) ++c;  // Chomp whitespace at end of line.
715
17.6k
                        if (*c != '\n') {
716
108
                            auto msg = "text block syntax requires new line after |||.";
717
108
                            throw StaticError(filename, begin, msg);
718
108
                        }
719
17.5k
                        std::stringstream block;
720
17.5k
                        c++;  // Skip the "\n"
721
17.5k
                        line_number++;
722
                        // Skip any blank lines at the beginning of the block.
723
22.8k
                        while (*c == '\n') {
724
5.37k
                            line_number++;
725
5.37k
                            ++c;
726
5.37k
                            block << '\n';
727
5.37k
                        }
728
17.5k
                        line_start = c;
729
17.5k
                        const char *first_line = c;
730
17.5k
                        int ws_chars = whitespace_check(first_line, c);
731
17.5k
                        string_block_indent = std::string(first_line, ws_chars);
732
17.5k
                        if (ws_chars == 0) {
733
83
                            auto msg = "text block's first line must start with whitespace.";
734
83
                            throw StaticError(filename, begin, msg);
735
83
                        }
736
22.1k
                        while (true) {
737
22.1k
                            assert(ws_chars > 0);
738
                            // Read up to the \n
739
14.7M
                            for (c = &c[ws_chars]; *c != '\n'; ++c) {
740
14.7M
                                if (*c == '\0')
741
173
                                    throw StaticError(filename, begin, "unexpected EOF");
742
14.7M
                                block << *c;
743
14.7M
                            }
744
                            // Add the \n
745
21.9k
                            block << '\n';
746
21.9k
                            ++c;
747
21.9k
                            line_number++;
748
21.9k
                            line_start = c;
749
                            // Skip any blank lines
750
24.6k
                            while (*c == '\n') {
751
2.66k
                                line_number++;
752
2.66k
                                ++c;
753
2.66k
                                block << '\n';
754
2.66k
                            }
755
                            // Examine next line
756
21.9k
                            ws_chars = whitespace_check(first_line, c);
757
21.9k
                            if (ws_chars == 0) {
758
                                // End of text block
759
                                // Skip over any whitespace
760
91.0k
                                while (*c == ' ' || *c == '\t') {
761
73.7k
                                    string_block_term_indent += *c;
762
73.7k
                                    ++c;
763
73.7k
                                }
764
                                // Expect |||
765
17.2k
                                if (!(*c == '|' && *(c + 1) == '|' && *(c + 2) == '|')) {
766
231
                                    auto msg = "text block not terminated with |||";
767
231
                                    throw StaticError(filename, begin, msg);
768
231
                                }
769
17.0k
                                c += 3;  // Leave after the last |
770
17.0k
                                data = block.str();
771
17.0k
                                kind = Token::STRING_BLOCK;
772
17.0k
                                if (chomp_trailing_nl) {
773
1.01k
                                    assert(data.back() == '\n');
774
1.01k
                                    data.pop_back();
775
1.01k
                                }
776
17.0k
                                break;  // Out of the while loop.
777
17.0k
                            }
778
21.9k
                        }
779
780
17.0k
                        break;  // Out of the switch.
781
17.4k
                    }
782
783
37.0M
                    const char *operator_begin = c;
784
120M
                    for (; is_symbol(*c); ++c) {
785
                        // Not allowed // in operators
786
83.6M
                        if (*c == '/' && *(c + 1) == '/')
787
1.93k
                            break;
788
                        // Not allowed /* in operators
789
83.6M
                        if (*c == '/' && *(c + 1) == '*')
790
1.55k
                            break;
791
                        // Not allowed ||| in operators
792
83.6M
                        if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|')
793
2.04k
                            break;
794
83.6M
                    }
795
                    // Not allowed to end with a + - ~ ! unless a single char.
796
                    // So, wind it back if we need to (but not too far).
797
48.9M
                    while (c > operator_begin + 1 && !allowed_at_end_of_operator(*(c - 1))) {
798
11.9M
                        c--;
799
11.9M
                    }
800
37.0M
                    data += std::string(operator_begin, c);
801
37.0M
                    if (data == "$") {
802
121k
                        kind = Token::DOLLAR;
803
121k
                        data = "";
804
36.8M
                    } else {
805
36.8M
                        kind = Token::OPERATOR;
806
36.8M
                    }
807
37.0M
                } else {
808
388
                    std::stringstream ss;
809
388
                    ss << "Could not lex the character ";
810
388
                    auto uc = (unsigned char)(*c);
811
388
                    if (*c < 32)
812
352
                        ss << "code " << unsigned(uc);
813
36
                    else
814
36
                        ss << "'" << *c << "'";
815
388
                    throw StaticError(filename, begin, ss.str());
816
388
                }
817
352M
        }
818
819
        // Ensure that a bug in the above code does not cause an infinite memory consuming loop due
820
        // to pushing empty tokens.
821
339M
        if (c == original_c) {
822
0
            throw StaticError(filename, begin, "internal lexing error:  pointer did not advance");
823
0
        }
824
825
339M
        Location end(line_number, (c + 1) - line_start);
826
339M
        r.emplace_back(kind,
827
339M
                       fodder,
828
339M
                       data,
829
339M
                       string_block_indent,
830
339M
                       string_block_term_indent,
831
339M
                       LocationRange(filename, begin, end));
832
339M
        fodder.clear();
833
339M
        fresh_line = false;
834
339M
    }
835
836
46.1k
    Location begin(line_number, c - line_start + 1);
837
46.1k
    Location end(line_number, (c + 1) - line_start + 1);
838
46.1k
    r.emplace_back(Token::END_OF_FILE, fodder, "", "", "", LocationRange(filename, begin, end));
839
46.1k
    return r;
840
47.7k
}
841
842
std::string jsonnet_unlex(const Tokens &tokens)
843
0
{
844
0
    std::stringstream ss;
845
0
    for (const auto &t : tokens) {
846
0
        for (const auto &f : t.fodder) {
847
0
            switch (f.kind) {
848
0
                case FodderElement::LINE_END: {
849
0
                    if (f.comment.size() > 0) {
850
0
                        ss << "LineEnd(" << f.blanks << ", " << f.indent << ", " << f.comment[0]
851
0
                           << ")\n";
852
0
                    } else {
853
0
                        ss << "LineEnd(" << f.blanks << ", " << f.indent << ")\n";
854
0
                    }
855
0
                } break;
856
857
0
                case FodderElement::INTERSTITIAL: {
858
0
                    ss << "Interstitial(" << f.comment[0] << ")\n";
859
0
                } break;
860
861
0
                case FodderElement::PARAGRAPH: {
862
0
                    ss << "Paragraph(\n";
863
0
                    for (const auto &line : f.comment) {
864
0
                        ss << "    " << line << '\n';
865
0
                    }
866
0
                    ss << ")" << f.blanks << "\n";
867
0
                } break;
868
0
            }
869
0
        }
870
0
        if (t.kind == Token::END_OF_FILE) {
871
0
            ss << "EOF\n";
872
0
            break;
873
0
        }
874
0
        if (t.kind == Token::STRING_DOUBLE) {
875
0
            ss << "\"" << t.data << "\"\n";
876
0
        } else if (t.kind == Token::STRING_SINGLE) {
877
0
            ss << "'" << t.data << "'\n";
878
0
        } else if (t.kind == Token::STRING_BLOCK) {
879
0
            ss << "|||\n";
880
0
            ss << t.stringBlockIndent;
881
0
            for (const char *cp = t.data.c_str(); *cp != '\0'; ++cp) {
882
0
                ss << *cp;
883
0
                if (*cp == '\n' && *(cp + 1) != '\n' && *(cp + 1) != '\0') {
884
0
                    ss << t.stringBlockIndent;
885
0
                }
886
0
            }
887
0
            ss << t.stringBlockTermIndent << "|||\n";
888
0
        } else {
889
0
            ss << t.data << "\n";
890
0
        }
891
0
    }
892
0
    return ss.str();
893
0
}
894
895
}  // namespace jsonnet::internal