Coverage Report

Created: 2025-10-27 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/jsonnet/core/lexer.cpp
Line
Count
Source
1
/*
2
Copyright 2015 Google Inc. All rights reserved.
3
4
Licensed under the Apache License, Version 2.0 (the "License");
5
you may not use this file except in compliance with the License.
6
You may obtain a copy of the License at
7
8
    http://www.apache.org/licenses/LICENSE-2.0
9
10
Unless required by applicable law or agreed to in writing, software
11
distributed under the License is distributed on an "AS IS" BASIS,
12
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
See the License for the specific language governing permissions and
14
limitations under the License.
15
*/
16
17
#include <cassert>
18
19
#include <map>
20
#include <sstream>
21
#include <string>
22
23
#include "lexer.h"
24
#include "static_error.h"
25
#include "unicode.h"
26
27
namespace jsonnet::internal {
28
29
/** Shared empty list of comment lines, passed when a fodder element carries no comment text. */
static const std::vector<std::string> EMPTY;
30
31
/** Report whether c is horizontal whitespace: a space, a tab or a carriage return.
 *
 * The newline character is deliberately not included.
 */
static bool is_horz_ws(char c)
{
    switch (c) {
        case ' ':
        case '\t':
        case '\r': return true;
        default: return false;
    }
}
36
37
/** Is the char whitespace. */
38
static bool is_ws(char c)
39
825M
{
40
825M
    return c == '\n' || is_horz_ws(c);
41
825M
}
42
43
/** Strip whitespace from both ends of a string, but only up to margin on the left hand side. */
44
static std::string strip_ws(const std::string &s, unsigned margin)
45
18.6M
{
46
18.6M
    if (s.size() == 0)
47
14.7M
        return s;  // Avoid underflow below.
48
3.88M
    size_t i = 0;
49
9.70M
    while (i < s.length() && is_horz_ws(s[i]) && i < margin)
50
5.81M
        i++;
51
3.88M
    size_t j = s.size();
52
7.33M
    while (j > i && is_horz_ws(s[j - 1])) {
53
3.44M
        j--;
54
3.44M
    }
55
3.88M
    return std::string(&s[i], &s[j]);
56
18.6M
}
57
58
/** Split a string by \n and also strip left (up to margin) & right whitespace from each line. */
59
static std::vector<std::string> line_split(const std::string &s, unsigned margin)
60
428k
{
61
428k
    std::vector<std::string> ret;
62
428k
    std::stringstream ss;
63
130M
    for (size_t i = 0; i < s.length(); ++i) {
64
129M
        if (s[i] == '\n') {
65
18.2M
            ret.emplace_back(strip_ws(ss.str(), margin));
66
18.2M
            ss.str("");
67
111M
        } else {
68
111M
            ss << s[i];
69
111M
        }
70
129M
    }
71
428k
    ret.emplace_back(strip_ws(ss.str(), margin));
72
428k
    return ret;
73
428k
}
74
75
/** Consume whitespace.
76
 *
77
 * Return number of \n and number of spaces after last \n.  Convert \t to spaces.
78
 */
79
static void lex_ws(const char *&c, unsigned &new_lines, unsigned &indent, const char *&line_start,
80
                   unsigned long &line_number)
81
362M
{
82
362M
    indent = 0;
83
362M
    new_lines = 0;
84
825M
    for (; *c != '\0' && is_ws(*c); c++) {
85
462M
        switch (*c) {
86
865k
            case '\r':
87
                // Ignore.
88
865k
                break;
89
90
54.0M
            case '\n':
91
54.0M
                indent = 0;
92
54.0M
                new_lines++;
93
54.0M
                line_number++;
94
54.0M
                line_start = c + 1;
95
54.0M
                break;
96
97
407M
            case ' ': indent += 1; break;
98
99
            // This only works for \t at the beginning of lines, but we strip it everywhere else
100
            // anyway.  The only case where this will cause a problem is spaces followed by \t
101
            // at the beginning of a line.  However that is rare, ill-advised, and if re-indentation
102
            // is enabled it will be fixed later.
103
59.3k
            case '\t': indent += 8; break;
104
462M
        }
105
462M
    }
106
362M
}
107
108
/**
109
# Consume all text until the end of the line, return number of newlines after that and indent
110
*/
111
static void lex_until_newline(const char *&c, std::string &text, unsigned &blanks, unsigned &indent,
112
                              const char *&line_start, unsigned long &line_number)
113
11.2M
{
114
11.2M
    const char *original_c = c;
115
11.2M
    const char *last_non_space = c;
116
138M
    for (; *c != '\0' && *c != '\n'; c++) {
117
127M
        if (!is_horz_ws(*c))
118
109M
            last_non_space = c;
119
127M
    }
120
11.2M
    text = std::string(original_c, last_non_space - original_c + 1);
121
    // Consume subsequent whitespace including the '\n'.
122
11.2M
    unsigned new_lines;
123
11.2M
    lex_ws(c, new_lines, indent, line_start, line_number);
124
11.2M
    blanks = new_lines == 0 ? 0 : new_lines - 1;
125
11.2M
}
126
127
/** Report whether c is an upper-case letter in the range 'A'..'Z'. */
static bool is_upper(char c)
{
    return 'A' <= c && c <= 'Z';
}
131
132
/** Report whether c is a lower-case letter in the range 'a'..'z'. */
static bool is_lower(char c)
{
    return 'a' <= c && c <= 'z';
}
136
137
/** Report whether c is a decimal digit in the range '0'..'9'. */
static bool is_number(char c)
{
    return '0' <= c && c <= '9';
}
141
142
static bool is_identifier_first(char c)
143
983M
{
144
983M
    return is_upper(c) || is_lower(c) || c == '_';
145
983M
}
146
147
static bool is_identifier(char c)
148
789M
{
149
789M
    return is_identifier_first(c) || is_number(c);
150
789M
}
151
152
/** Report whether c is one of the characters that operators are built from. */
static bool is_symbol(char c)
{
    switch (c) {
        // Arithmetic.
        case '+':
        case '-':
        case '*':
        case '/':
        case '%':
        // Bitwise and logical.
        case '&':
        case '|':
        case '^':
        case '~':
        case '!':
        // Comparison and assignment.
        case '=':
        case '<':
        case '>':
        // Miscellaneous.
        case ':':
        case '$': return true;

        default: return false;
    }
}
173
174
20.9M
/** Report whether c may be the last character of a multi-character operator.
 *
 * The characters + - ~ ! $ are rejected; every other character is accepted.
 */
bool allowed_at_end_of_operator(char c) {
    const bool rejected = c == '+' || c == '-' || c == '~' || c == '!' || c == '$';
    return !rejected;
}
184
185
static const std::map<std::string, Token::Kind> keywords = {
186
    {"assert", Token::ASSERT},
187
    {"else", Token::ELSE},
188
    {"error", Token::ERROR},
189
    {"false", Token::FALSE},
190
    {"for", Token::FOR},
191
    {"function", Token::FUNCTION},
192
    {"if", Token::IF},
193
    {"import", Token::IMPORT},
194
    {"importstr", Token::IMPORTSTR},
195
    {"importbin", Token::IMPORTBIN},
196
    {"in", Token::IN},
197
    {"local", Token::LOCAL},
198
    {"null", Token::NULL_LIT},
199
    {"self", Token::SELF},
200
    {"super", Token::SUPER},
201
    {"tailstrict", Token::TAILSTRICT},
202
    {"then", Token::THEN},
203
    {"true", Token::TRUE},
204
};
205
206
Token::Kind lex_get_keyword_kind(const std::string &identifier)
207
143M
{
208
143M
    auto it = keywords.find(identifier);
209
143M
    if (it == keywords.end())
210
105M
        return Token::IDENTIFIER;
211
38.2M
    return it->second;
212
143M
}
213
214
std::string lex_number(const char *&c, const std::string &filename, const Location &begin)
215
19.7M
{
216
    // This function should be understood with reference to the linked image:
217
    // https://www.json.org/img/number.png
218
219
    // Note, we deviate from the json.org documentation as follows:
220
    // There is no reason to lex negative numbers as atomic tokens, it is better to parse them
221
    // as a unary operator combined with a numeric literal.  This avoids x-1 being tokenized as
222
    // <identifier> <number> instead of the intended <identifier> <binop> <number>.
223
224
19.7M
    enum State {
225
19.7M
        BEGIN,
226
19.7M
        AFTER_ZERO,
227
19.7M
        AFTER_ONE_TO_NINE,
228
19.7M
        AFTER_DOT,
229
19.7M
        AFTER_DIGIT,
230
19.7M
        AFTER_E,
231
19.7M
        AFTER_EXP_SIGN,
232
19.7M
        AFTER_EXP_DIGIT
233
19.7M
    } state;
234
235
19.7M
    std::string r;
236
237
19.7M
    state = BEGIN;
238
45.0M
    while (true) {
239
45.0M
        switch (state) {
240
19.7M
            case BEGIN:
241
19.7M
                switch (*c) {
242
8.28M
                    case '0': state = AFTER_ZERO; break;
243
244
4.46M
                    case '1':
245
5.34M
                    case '2':
246
6.18M
                    case '3':
247
7.25M
                    case '4':
248
7.47M
                    case '5':
249
8.31M
                    case '6':
250
8.73M
                    case '7':
251
10.2M
                    case '8':
252
11.4M
                    case '9': state = AFTER_ONE_TO_NINE; break;
253
254
0
                    default: throw StaticError(filename, begin, "couldn't lex number");
255
19.7M
                }
256
19.7M
                break;
257
258
19.7M
            case AFTER_ZERO:
259
8.28M
                switch (*c) {
260
49.6k
                    case '.': state = AFTER_DOT; break;
261
262
2.35k
                    case 'e':
263
5.66k
                    case 'E': state = AFTER_E; break;
264
265
8.23M
                    default: goto end;
266
8.28M
                }
267
55.3k
                break;
268
269
14.4M
            case AFTER_ONE_TO_NINE:
270
14.4M
                switch (*c) {
271
31.5k
                    case '.': state = AFTER_DOT; break;
272
273
2.47k
                    case 'e':
274
275k
                    case 'E': state = AFTER_E; break;
275
276
1.07M
                    case '0':
277
1.12M
                    case '1':
278
1.39M
                    case '2':
279
1.49M
                    case '3':
280
1.60M
                    case '4':
281
1.84M
                    case '5':
282
2.00M
                    case '6':
283
2.07M
                    case '7':
284
2.15M
                    case '8':
285
2.99M
                    case '9': state = AFTER_ONE_TO_NINE; break;
286
287
11.1M
                    default: goto end;
288
14.4M
                }
289
3.29M
                break;
290
291
3.29M
            case AFTER_DOT:
292
81.2k
                switch (*c) {
293
3.46k
                    case '0':
294
28.7k
                    case '1':
295
29.2k
                    case '2':
296
30.1k
                    case '3':
297
31.0k
                    case '4':
298
79.2k
                    case '5':
299
80.1k
                    case '6':
300
80.1k
                    case '7':
301
80.6k
                    case '8':
302
81.1k
                    case '9': state = AFTER_DIGIT; break;
303
304
29
                    default: {
305
29
                        std::stringstream ss;
306
29
                        ss << "couldn't lex number, junk after decimal point: " << *c;
307
29
                        throw StaticError(filename, begin, ss.str());
308
80.6k
                    }
309
81.2k
                }
310
81.1k
                break;
311
312
1.65M
            case AFTER_DIGIT:
313
1.65M
                switch (*c) {
314
2.57k
                    case 'e':
315
3.82k
                    case 'E': state = AFTER_E; break;
316
317
1.10M
                    case '0':
318
1.18M
                    case '1':
319
1.21M
                    case '2':
320
1.26M
                    case '3':
321
1.29M
                    case '4':
322
1.37M
                    case '5':
323
1.43M
                    case '6':
324
1.46M
                    case '7':
325
1.49M
                    case '8':
326
1.57M
                    case '9': state = AFTER_DIGIT; break;
327
328
77.3k
                    default: goto end;
329
1.65M
                }
330
1.57M
                break;
331
332
1.57M
            case AFTER_E:
333
284k
                switch (*c) {
334
1.69k
                    case '+':
335
4.49k
                    case '-': state = AFTER_EXP_SIGN; break;
336
337
3.01k
                    case '0':
338
6.54k
                    case '1':
339
275k
                    case '2':
340
277k
                    case '3':
341
278k
                    case '4':
342
279k
                    case '5':
343
279k
                    case '6':
344
279k
                    case '7':
345
279k
                    case '8':
346
280k
                    case '9': state = AFTER_EXP_DIGIT; break;
347
348
82
                    default: {
349
82
                        std::stringstream ss;
350
82
                        ss << "couldn't lex number, junk after 'E': " << *c;
351
82
                        throw StaticError(filename, begin, ss.str());
352
279k
                    }
353
284k
                }
354
284k
                break;
355
356
284k
            case AFTER_EXP_SIGN:
357
4.49k
                switch (*c) {
358
612
                    case '0':
359
883
                    case '1':
360
982
                    case '2':
361
3.98k
                    case '3':
362
4.37k
                    case '4':
363
4.38k
                    case '5':
364
4.38k
                    case '6':
365
4.39k
                    case '7':
366
4.40k
                    case '8':
367
4.47k
                    case '9': state = AFTER_EXP_DIGIT; break;
368
369
17
                    default: {
370
17
                        std::stringstream ss;
371
17
                        ss << "couldn't lex number, junk after exponent sign: " << *c;
372
17
                        throw StaticError(filename, begin, ss.str());
373
4.40k
                    }
374
4.49k
                }
375
4.47k
                break;
376
377
573k
            case AFTER_EXP_DIGIT:
378
573k
                switch (*c) {
379
4.89k
                    case '0':
380
7.28k
                    case '1':
381
9.80k
                    case '2':
382
277k
                    case '3':
383
279k
                    case '4':
384
281k
                    case '5':
385
282k
                    case '6':
386
284k
                    case '7':
387
286k
                    case '8':
388
288k
                    case '9': state = AFTER_EXP_DIGIT; break;
389
390
284k
                    default: goto end;
391
573k
                }
392
288k
                break;
393
45.0M
        }
394
25.3M
        r += *c;
395
25.3M
        c++;
396
25.3M
    }
397
19.7M
end:
398
19.7M
    return r;
399
19.7M
}
400
401
// Measure the leading run of spaces and tabs in a and check that b starts with exactly the
// same characters.  Returns the length of that run if b matches it, otherwise 0.  If a has no
// whitespace prefix then the result is also 0.
static int whitespace_check(const char *a, const char *b)
{
    int n = 0;
    for (; a[n] == ' ' || a[n] == '\t'; ++n) {
        if (a[n] != b[n])
            return 0;
    }
    return n;
}
413
414
/*
415
static void add_whitespace(Fodder &fodder, const char *s, size_t n)
416
{
417
    std::string ws(s, n);
418
    if (fodder.size() == 0 || fodder.back().kind != FodderElement::WHITESPACE) {
419
        fodder.emplace_back(FodderElement::WHITESPACE, ws);
420
    } else {
421
        fodder.back().data += ws;
422
    }
423
}
424
*/
425
426
Tokens jsonnet_lex(const std::string &filename, const char *input)
427
47.8k
{
428
47.8k
    unsigned long line_number = 1;
429
47.8k
    const char *line_start = input;
430
431
47.8k
    Tokens r;
432
433
47.8k
    const char *c = input;
434
435
47.8k
    Fodder fodder;
436
47.8k
    bool fresh_line = true;  // Are we tokenizing from the beginning of a new line?
437
438
350M
    while (*c != '\0') {
439
        // Used to ensure we have actually advanced the pointer by the end of the iteration.
440
350M
        const char *original_c = c;
441
442
350M
        Token::Kind kind;
443
350M
        std::string data;
444
350M
        std::string string_block_indent;
445
350M
        std::string string_block_term_indent;
446
447
350M
        unsigned new_lines, indent;
448
350M
        lex_ws(c, new_lines, indent, line_start, line_number);
449
450
        // If it's the end of the file, discard final whitespace.
451
350M
        if (*c == '\0')
452
24.6k
            break;
453
454
350M
        if (new_lines > 0) {
455
            // Otherwise store whitespace in fodder.
456
35.8M
            unsigned blanks = new_lines - 1;
457
35.8M
            fodder.emplace_back(FodderElement::LINE_END, blanks, indent, EMPTY);
458
35.8M
            fresh_line = true;
459
35.8M
        }
460
461
350M
        Location begin(line_number, c - line_start + 1);
462
463
350M
        switch (*c) {
464
            // The following operators should never be combined with subsequent symbols.
465
1.48M
            case '{':
466
1.48M
                kind = Token::BRACE_L;
467
1.48M
                c++;
468
1.48M
                break;
469
470
1.46M
            case '}':
471
1.46M
                kind = Token::BRACE_R;
472
1.46M
                c++;
473
1.46M
                break;
474
475
7.63M
            case '[':
476
7.63M
                kind = Token::BRACKET_L;
477
7.63M
                c++;
478
7.63M
                break;
479
480
7.59M
            case ']':
481
7.59M
                kind = Token::BRACKET_R;
482
7.59M
                c++;
483
7.59M
                break;
484
485
31.2M
            case ',':
486
31.2M
                kind = Token::COMMA;
487
31.2M
                c++;
488
31.2M
                break;
489
490
17.2M
            case '.':
491
17.2M
                kind = Token::DOT;
492
17.2M
                c++;
493
17.2M
                break;
494
495
25.9M
            case '(':
496
25.9M
                kind = Token::PAREN_L;
497
25.9M
                c++;
498
25.9M
                break;
499
500
25.9M
            case ')':
501
25.9M
                kind = Token::PAREN_R;
502
25.9M
                c++;
503
25.9M
                break;
504
505
6.76M
            case ';':
506
6.76M
                kind = Token::SEMICOLON;
507
6.76M
                c++;
508
6.76M
                break;
509
510
            // Numeric literals.
511
8.28M
            case '0':
512
12.7M
            case '1':
513
13.6M
            case '2':
514
14.4M
            case '3':
515
15.5M
            case '4':
516
15.7M
            case '5':
517
16.5M
            case '6':
518
17.0M
            case '7':
519
18.5M
            case '8':
520
19.7M
            case '9':
521
19.7M
                kind = Token::NUMBER;
522
19.7M
                data = lex_number(c, filename, begin);
523
19.7M
                break;
524
525
            // UString literals.
526
338k
            case '"': {
527
338k
                c++;
528
83.4M
                for (;; ++c) {
529
83.4M
                    if (*c == '\0') {
530
95
                        throw StaticError(filename, begin, "unterminated string");
531
95
                    }
532
83.4M
                    if (*c == '"') {
533
338k
                        break;
534
338k
                    }
535
83.0M
                    if (*c == '\\' && *(c + 1) != '\0') {
536
223k
                        data += *c;
537
223k
                        ++c;
538
223k
                    }
539
83.0M
                    if (*c == '\n') {
540
                        // Maintain line/column counters.
541
6.83M
                        line_number++;
542
6.83M
                        line_start = c + 1;
543
6.83M
                    }
544
83.0M
                    data += *c;
545
83.0M
                }
546
338k
                c++;  // Advance beyond the ".
547
338k
                kind = Token::STRING_DOUBLE;
548
338k
            } break;
549
550
            // UString literals.
551
11.2M
            case '\'': {
552
11.2M
                c++;
553
150M
                for (;; ++c) {
554
150M
                    if (*c == '\0') {
555
75
                        throw StaticError(filename, begin, "unterminated string");
556
75
                    }
557
150M
                    if (*c == '\'') {
558
11.2M
                        break;
559
11.2M
                    }
560
138M
                    if (*c == '\\' && *(c + 1) != '\0') {
561
1.22M
                        data += *c;
562
1.22M
                        ++c;
563
1.22M
                    }
564
138M
                    if (*c == '\n') {
565
                        // Maintain line/column counters.
566
4.77M
                        line_number++;
567
4.77M
                        line_start = c + 1;
568
4.77M
                    }
569
138M
                    data += *c;
570
138M
                }
571
11.2M
                c++;  // Advance beyond the '.
572
11.2M
                kind = Token::STRING_SINGLE;
573
11.2M
            } break;
574
575
            // Verbatim string literals.
576
            // ' and " quoting is interpreted here, unlike non-verbatim strings
577
            // where it is done later by jsonnet_string_unescape.  This is OK
578
            // in this case because no information is lost by resolving the
579
            // repeated quote into a single quote, so we can go back to the
580
            // original form in the formatter.
581
10.8k
            case '@': {
582
10.8k
                c++;
583
10.8k
                if (*c != '"' && *c != '\'') {
584
38
                    std::stringstream ss;
585
38
                    ss << "couldn't lex verbatim string, junk after '@': " << *c;
586
38
                    throw StaticError(filename, begin, ss.str());
587
38
                }
588
10.8k
                const char quot = *c;
589
10.8k
                c++;  // Advance beyond the opening quote.
590
420k
                for (;; ++c) {
591
420k
                    if (*c == '\0') {
592
74
                        throw StaticError(filename, begin, "unterminated verbatim string");
593
74
                    }
594
420k
                    if (*c == quot) {
595
13.5k
                        if (*(c + 1) == quot) {
596
2.75k
                            c++;
597
10.7k
                        } else {
598
10.7k
                            break;
599
10.7k
                        }
600
13.5k
                    }
601
409k
                    data += *c;
602
409k
                }
603
10.7k
                c++;  // Advance beyond the closing quote.
604
10.7k
                if (quot == '"') {
605
7.49k
                    kind = Token::VERBATIM_STRING_DOUBLE;
606
7.49k
                } else {
607
3.28k
                    kind = Token::VERBATIM_STRING_SINGLE;
608
3.28k
                }
609
10.7k
            } break;
610
611
            // Keywords
612
193M
            default:
613
193M
                if (is_identifier_first(*c)) {
614
143M
                    std::string id;
615
789M
                    for (; is_identifier(*c); ++c)
616
646M
                        id += *c;
617
143M
                    kind = lex_get_keyword_kind(id);
618
143M
                    data = id;
619
620
143M
                } else if (is_symbol(*c) || *c == '#') {
621
                    // Single line C++ and Python style comments.
622
49.6M
                    if (*c == '#' || (*c == '/' && *(c + 1) == '/')) {
623
11.2M
                        std::vector<std::string> comment(1);
624
11.2M
                        unsigned blanks;
625
11.2M
                        unsigned indent;
626
11.2M
                        lex_until_newline(c, comment[0], blanks, indent, line_start, line_number);
627
11.2M
                        auto kind = fresh_line ? FodderElement::PARAGRAPH : FodderElement::LINE_END;
628
11.2M
                        fodder.emplace_back(kind, blanks, indent, comment);
629
11.2M
                        fresh_line = true;
630
11.2M
                        continue;  // We've not got a token, just fodder, so keep scanning.
631
11.2M
                    }
632
633
                    // Multi-line C style comment.
634
38.3M
                    if (*c == '/' && *(c + 1) == '*') {
635
1.42M
                        unsigned margin = c - line_start;
636
637
1.42M
                        const char *initial_c = c;
638
1.42M
                        c += 2;  // Avoid matching /*/: skip the /* before starting the search for
639
                                 // */.
640
641
135M
                        while (!(*c == '*' && *(c + 1) == '/')) {
642
134M
                            if (*c == '\0') {
643
211
                                auto msg = "multi-line comment has no terminating */.";
644
211
                                throw StaticError(filename, begin, msg);
645
211
                            }
646
134M
                            if (*c == '\n') {
647
                                // Just keep track of the line / column counters.
648
18.2M
                                line_number++;
649
18.2M
                                line_start = c + 1;
650
18.2M
                            }
651
134M
                            ++c;
652
134M
                        }
653
1.42M
                        c += 2;  // Move the pointer to the char after the closing '/'.
654
655
1.42M
                        std::string comment(initial_c,
656
1.42M
                                            c - initial_c);  // Includes the "/*" and "*/".
657
658
                        // Lex whitespace after comment
659
1.42M
                        unsigned new_lines_after, indent_after;
660
1.42M
                        lex_ws(c, new_lines_after, indent_after, line_start, line_number);
661
1.42M
                        std::vector<std::string> lines;
662
1.42M
                        if (comment.find('\n') >= comment.length()) {
663
                            // Comment looks like /* foo */
664
998k
                            lines.push_back(comment);
665
998k
                            fodder.emplace_back(FodderElement::INTERSTITIAL, 0, 0, lines);
666
998k
                            if (new_lines_after > 0) {
667
937k
                                fodder.emplace_back(FodderElement::LINE_END,
668
937k
                                                    new_lines_after - 1,
669
937k
                                                    indent_after,
670
937k
                                                    EMPTY);
671
937k
                                fresh_line = true;
672
937k
                            }
673
998k
                        } else {
674
428k
                            lines = line_split(comment, margin);
675
428k
                            assert(lines[0][0] == '/');
676
                            // Little hack to support PARAGRAPHs with * down the LHS:
677
                            // Add a space to lines that start with a '*'
678
428k
                            bool all_star = true;
679
18.6M
                            for (auto &l : lines) {
680
18.6M
                                if (l[0] != '*')
681
18.4M
                                    all_star = false;
682
18.6M
                            }
683
428k
                            if (all_star) {
684
0
                                for (auto &l : lines) {
685
0
                                    if (l[0] == '*')
686
0
                                        l = " " + l;
687
0
                                }
688
0
                            }
689
428k
                            if (new_lines_after == 0) {
690
                                // Ensure a line end after the paragraph.
691
27.0k
                                new_lines_after = 1;
692
27.0k
                                indent_after = 0;
693
27.0k
                            }
694
428k
                            fodder_push_back(fodder,
695
428k
                                             FodderElement(FodderElement::PARAGRAPH,
696
428k
                                                           new_lines_after - 1,
697
428k
                                                           indent_after,
698
428k
                                                           lines));
699
428k
                            fresh_line = true;
700
428k
                        }
701
1.42M
                        continue;  // We've not got a token, just fodder, so keep scanning.
702
1.42M
                    }
703
704
                    // Text block
705
36.9M
                    if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|') {
706
17.6k
                        c += 3;  // Skip the "|||".
707
708
17.6k
                        bool chomp_trailing_nl = false;
709
17.6k
                        if (*c == '-') {
710
1.04k
                            chomp_trailing_nl = true;
711
1.04k
                            c++;
712
1.04k
                        }
713
714
21.2k
                        while (is_horz_ws(*c)) ++c;  // Chomp whitespace at end of line.
715
17.6k
                        if (*c != '\n') {
716
111
                            auto msg = "text block syntax requires new line after |||.";
717
111
                            throw StaticError(filename, begin, msg);
718
111
                        }
719
17.5k
                        std::stringstream block;
720
17.5k
                        c++;  // Skip the "\n"
721
17.5k
                        line_number++;
722
                        // Skip any blank lines at the beginning of the block.
723
22.7k
                        while (*c == '\n') {
724
5.21k
                            line_number++;
725
5.21k
                            ++c;
726
5.21k
                            block << '\n';
727
5.21k
                        }
728
17.5k
                        line_start = c;
729
17.5k
                        const char *first_line = c;
730
17.5k
                        int ws_chars = whitespace_check(first_line, c);
731
17.5k
                        string_block_indent = std::string(first_line, ws_chars);
732
17.5k
                        if (ws_chars == 0) {
733
83
                            auto msg = "text block's first line must start with whitespace.";
734
83
                            throw StaticError(filename, begin, msg);
735
83
                        }
736
22.4k
                        while (true) {
737
22.4k
                            assert(ws_chars > 0);
738
                            // Read up to the \n
739
14.0M
                            for (c = &c[ws_chars]; *c != '\n'; ++c) {
740
14.0M
                                if (*c == '\0')
741
173
                                    throw StaticError(filename, begin, "unexpected EOF");
742
14.0M
                                block << *c;
743
14.0M
                            }
744
                            // Add the \n
745
22.2k
                            block << '\n';
746
22.2k
                            ++c;
747
22.2k
                            line_number++;
748
22.2k
                            line_start = c;
749
                            // Skip any blank lines
750
25.0k
                            while (*c == '\n') {
751
2.74k
                                line_number++;
752
2.74k
                                ++c;
753
2.74k
                                block << '\n';
754
2.74k
                            }
755
                            // Examine next line
756
22.2k
                            ws_chars = whitespace_check(first_line, c);
757
22.2k
                            if (ws_chars == 0) {
758
                                // End of text block
759
                                // Skip over any whitespace
760
75.5k
                                while (*c == ' ' || *c == '\t') {
761
58.2k
                                    string_block_term_indent += *c;
762
58.2k
                                    ++c;
763
58.2k
                                }
764
                                // Expect |||
765
17.2k
                                if (!(*c == '|' && *(c + 1) == '|' && *(c + 2) == '|')) {
766
240
                                    auto msg = "text block not terminated with |||";
767
240
                                    throw StaticError(filename, begin, msg);
768
240
                                }
769
17.0k
                                c += 3;  // Leave after the last |
770
17.0k
                                data = block.str();
771
17.0k
                                kind = Token::STRING_BLOCK;
772
17.0k
                                if (chomp_trailing_nl) {
773
1.02k
                                    assert(data.back() == '\n');
774
1.02k
                                    data.pop_back();
775
1.02k
                                }
776
17.0k
                                break;  // Out of the while loop.
777
17.0k
                            }
778
22.2k
                        }
779
780
17.0k
                        break;  // Out of the switch.
781
17.4k
                    }
782
783
36.8M
                    const char *operator_begin = c;
784
120M
                    for (; is_symbol(*c); ++c) {
785
                        // Not allowed // in operators
786
83.3M
                        if (*c == '/' && *(c + 1) == '/')
787
1.71k
                            break;
788
                        // Not allowed /* in operators
789
83.3M
                        if (*c == '/' && *(c + 1) == '*')
790
1.61k
                            break;
791
                        // Not allowed ||| in operators
792
83.3M
                        if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|')
793
2.06k
                            break;
794
83.3M
                    }
795
                    // Not allowed to end with a + - ~ ! unless a single char.
796
                    // So, wind it back if we need to (but not too far).
797
47.5M
                    while (c > operator_begin + 1 && !allowed_at_end_of_operator(*(c - 1))) {
798
10.6M
                        c--;
799
10.6M
                    }
800
36.8M
                    data += std::string(operator_begin, c);
801
36.8M
                    if (data == "$") {
802
144k
                        kind = Token::DOLLAR;
803
144k
                        data = "";
804
36.7M
                    } else {
805
36.7M
                        kind = Token::OPERATOR;
806
36.7M
                    }
807
36.8M
                } else {
808
410
                    std::stringstream ss;
809
410
                    ss << "Could not lex the character ";
810
410
                    auto uc = (unsigned char)(*c);
811
410
                    if (*c < 32)
812
374
                        ss << "code " << unsigned(uc);
813
36
                    else
814
36
                        ss << "'" << *c << "'";
815
410
                    throw StaticError(filename, begin, ss.str());
816
410
                }
817
350M
        }
818
819
        // Ensure that a bug in the above code does not cause an infinite memory consuming loop due
820
        // to pushing empty tokens.
821
337M
        if (c == original_c) {
822
0
            throw StaticError(filename, begin, "internal lexing error:  pointer did not advance");
823
0
        }
824
825
337M
        Location end(line_number, (c + 1) - line_start);
826
337M
        r.emplace_back(kind,
827
337M
                       fodder,
828
337M
                       data,
829
337M
                       string_block_indent,
830
337M
                       string_block_term_indent,
831
337M
                       LocationRange(filename, begin, end));
832
337M
        fodder.clear();
833
337M
        fresh_line = false;
834
337M
    }
835
836
46.2k
    Location begin(line_number, c - line_start + 1);
837
46.2k
    Location end(line_number, (c + 1) - line_start + 1);
838
46.2k
    r.emplace_back(Token::END_OF_FILE, fodder, "", "", "", LocationRange(filename, begin, end));
839
46.2k
    return r;
840
47.8k
}
841
842
std::string jsonnet_unlex(const Tokens &tokens)
843
0
{
844
0
    std::stringstream ss;
845
0
    for (const auto &t : tokens) {
846
0
        for (const auto &f : t.fodder) {
847
0
            switch (f.kind) {
848
0
                case FodderElement::LINE_END: {
849
0
                    if (f.comment.size() > 0) {
850
0
                        ss << "LineEnd(" << f.blanks << ", " << f.indent << ", " << f.comment[0]
851
0
                           << ")\n";
852
0
                    } else {
853
0
                        ss << "LineEnd(" << f.blanks << ", " << f.indent << ")\n";
854
0
                    }
855
0
                } break;
856
857
0
                case FodderElement::INTERSTITIAL: {
858
0
                    ss << "Interstitial(" << f.comment[0] << ")\n";
859
0
                } break;
860
861
0
                case FodderElement::PARAGRAPH: {
862
0
                    ss << "Paragraph(\n";
863
0
                    for (const auto &line : f.comment) {
864
0
                        ss << "    " << line << '\n';
865
0
                    }
866
0
                    ss << ")" << f.blanks << "\n";
867
0
                } break;
868
0
            }
869
0
        }
870
0
        if (t.kind == Token::END_OF_FILE) {
871
0
            ss << "EOF\n";
872
0
            break;
873
0
        }
874
0
        if (t.kind == Token::STRING_DOUBLE) {
875
0
            ss << "\"" << t.data << "\"\n";
876
0
        } else if (t.kind == Token::STRING_SINGLE) {
877
0
            ss << "'" << t.data << "'\n";
878
0
        } else if (t.kind == Token::STRING_BLOCK) {
879
0
            ss << "|||\n";
880
0
            ss << t.stringBlockIndent;
881
0
            for (const char *cp = t.data.c_str(); *cp != '\0'; ++cp) {
882
0
                ss << *cp;
883
0
                if (*cp == '\n' && *(cp + 1) != '\n' && *(cp + 1) != '\0') {
884
0
                    ss << t.stringBlockIndent;
885
0
                }
886
0
            }
887
0
            ss << t.stringBlockTermIndent << "|||\n";
888
0
        } else {
889
0
            ss << t.data << "\n";
890
0
        }
891
0
    }
892
0
    return ss.str();
893
0
}
894
895
}  // namespace jsonnet::internal