Coverage Report

Created: 2025-10-12 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/jsonnet/core/lexer.cpp
Line
Count
Source
1
/*
2
Copyright 2015 Google Inc. All rights reserved.
3
4
Licensed under the Apache License, Version 2.0 (the "License");
5
you may not use this file except in compliance with the License.
6
You may obtain a copy of the License at
7
8
    http://www.apache.org/licenses/LICENSE-2.0
9
10
Unless required by applicable law or agreed to in writing, software
11
distributed under the License is distributed on an "AS IS" BASIS,
12
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
See the License for the specific language governing permissions and
14
limitations under the License.
15
*/
16
17
#include <cassert>
18
19
#include <map>
20
#include <sstream>
21
#include <string>
22
23
#include "lexer.h"
24
#include "static_error.h"
25
#include "unicode.h"
26
27
namespace jsonnet::internal {
28
29
/** Shared empty string vector; jsonnet_lex passes it as the last argument when it creates
 * LINE_END fodder elements that have no comment text. */
static const std::vector<std::string> EMPTY;
30
31
/** Is the char horizontal whitespace, i.e. a space, \t or \r (excluding \n). */
32
static bool is_horz_ws(char c)
33
887M
{
34
887M
    return c == ' ' || c == '\t' || c == '\r';
35
887M
}
36
37
/** Is the char whitespace, i.e. \n or any char accepted by is_horz_ws. */
38
static bool is_ws(char c)
39
798M
{
40
798M
    return c == '\n' || is_horz_ws(c);
41
798M
}
42
43
/** Strip horizontal whitespace (see is_horz_ws) from both ends of a string and return the result
 * as a new string.  At most margin chars are removed on the left hand side; the right hand side
 * is stripped completely. */
44
static std::string strip_ws(const std::string &s, unsigned margin)
45
19.6M
{
46
19.6M
    if (s.size() == 0)
47
15.0M
        return s;  // Avoid underflow below.
48
4.68M
    size_t i = 0;  // Index of the first char to keep.
49
10.3M
    while (i < s.length() && is_horz_ws(s[i]) && i < margin)
50
5.61M
        i++;
51
4.68M
    size_t j = s.size();  // One past the last char to keep; never moves below i.
52
8.02M
    while (j > i && is_horz_ws(s[j - 1])) {
53
3.34M
        j--;
54
3.34M
    }
55
4.68M
    return std::string(&s[i], &s[j]);
56
19.6M
}
57
58
/** Split a string by \n and also strip left (up to margin) & right whitespace from each line.
 *
 * The \n chars themselves are dropped.  The result always has at least one element, because the
 * text after the last \n (possibly empty) is appended as the final line. */
59
static std::vector<std::string> line_split(const std::string &s, unsigned margin)
60
524k
{
61
524k
    std::vector<std::string> ret;
62
524k
    std::stringstream ss;  // Accumulates the chars of the current line.
63
131M
    for (size_t i = 0; i < s.length(); ++i) {
64
130M
        if (s[i] == '\n') {
65
19.1M
            ret.emplace_back(strip_ws(ss.str(), margin));
66
19.1M
            ss.str("");  // Start the next line with an empty buffer.
67
111M
        } else {
68
111M
            ss << s[i];
69
111M
        }
70
130M
    }
71
524k
    ret.emplace_back(strip_ws(ss.str(), margin));  // Whatever follows the last \n.
72
524k
    return ret;
73
524k
}
74
75
/** Consume whitespace, leaving c on the first non-whitespace char (or the terminating \0).
76
 *
77
 * Sets new_lines to the number of \n consumed and indent to the width of the whitespace after the
 * last \n: a space counts as 1, a \t as 8 and a \r as 0.  For every \n consumed, line_number is
 * incremented and line_start is set to the char following it.
78
 */
79
static void lex_ws(const char *&c, unsigned &new_lines, unsigned &indent, const char *&line_start,
80
                   unsigned long &line_number)
81
351M
{
82
351M
    indent = 0;
83
351M
    new_lines = 0;
84
798M
    for (; *c != '\0' && is_ws(*c); c++) {
85
446M
        switch (*c) {
86
1.01M
            case '\r':
87
                // Ignore.
88
1.01M
                break;
89
90
52.6M
            case '\n':
91
52.6M
                indent = 0;  // Indent is only measured on the last line of the whitespace run.
92
52.6M
                new_lines++;
93
52.6M
                line_number++;
94
52.6M
                line_start = c + 1;
95
52.6M
                break;
96
97
392M
            case ' ': indent += 1; break;
98
99
            // This only works for \t at the beginning of lines, but we strip it everywhere else
100
            // anyway.  The only case where this will cause a problem is spaces followed by \t
101
            // at the beginning of a line.  However that is rare, ill-advised, and if re-indentation
102
            // is enabled it will be fixed later.
103
48.1k
            case '\t': indent += 8; break;
104
446M
        }
105
446M
    }
106
351M
}
107
108
/**
109
 * Consume all text until the end of the line (\n or \0), then the whitespace that follows it.
 *
 * text receives the chars from the initial c up to and including the last char that is not
 * horizontal whitespace (the char at the initial c is always included).  blanks is set to the
 * number of \n consumed after the text minus one (0 if none were consumed) and indent to the
 * value computed by lex_ws.  line_start and line_number are updated by lex_ws.
110
 */
111
static void lex_until_newline(const char *&c, std::string &text, unsigned &blanks, unsigned &indent,
112
                              const char *&line_start, unsigned long &line_number)
113
10.7M
{
114
10.7M
    const char *original_c = c;
115
10.7M
    const char *last_non_space = c;
116
134M
    for (; *c != '\0' && *c != '\n'; c++) {
117
123M
        if (!is_horz_ws(*c))
118
106M
            last_non_space = c;
119
123M
    }
120
10.7M
    text = std::string(original_c, last_non_space - original_c + 1);  // Drops trailing whitespace.
121
    // Consume subsequent whitespace including the '\n'.
122
10.7M
    unsigned new_lines;
123
10.7M
    lex_ws(c, new_lines, indent, line_start, line_number);
124
10.7M
    blanks = new_lines == 0 ? 0 : new_lines - 1;
125
10.7M
}
126
127
/** Is the char in the range 'A' to 'Z'. */
static bool is_upper(char c)
128
947M
{
129
947M
    return c >= 'A' && c <= 'Z';
130
947M
}
131
132
/** Is the char in the range 'a' to 'z'. */
static bool is_lower(char c)
133
931M
{
134
931M
    return c >= 'a' && c <= 'z';
135
931M
}
136
137
/** Is the char a decimal digit ('0' to '9'). */
static bool is_number(char c)
138
167M
{
139
167M
    return c >= '0' && c <= '9';
140
167M
}
141
142
/** May the char start an identifier: a letter accepted by is_upper / is_lower, or '_'. */
static bool is_identifier_first(char c)
143
947M
{
144
947M
    return is_upper(c) || is_lower(c) || c == '_';
145
947M
}
146
147
/** May the char appear inside an identifier: anything accepted by is_identifier_first, or a
 * digit. */
static bool is_identifier(char c)
148
760M
{
149
760M
    return is_identifier_first(c) || is_number(c);
150
760M
}
151
152
/** Is the char one of the symbol chars ! $ : ~ + - & | ^ = < > * / %.  jsonnet_lex builds
 * operator tokens out of runs of these chars. */
static bool is_symbol(char c)
153
165M
{
154
165M
    switch (c) {
155
3.90M
        case '!':
156
4.37M
        case '$':
157
21.4M
        case ':':
158
22.5M
        case '~':
159
48.8M
        case '+':
160
52.0M
        case '-':
161
55.1M
        case '&':
162
56.8M
        case '|':
163
56.8M
        case '^':
164
89.5M
        case '=':
165
92.4M
        case '<':
166
96.3M
        case '>':
167
115M
        case '*':
168
119M
        case '/':
169
121M
        case '%': return true;
170
165M
    }
171
44.2M
    return false;
172
165M
}
173
174
20.6M
/** May the char be the last char of an operator that is longer than one char.
 *
 * Returns false for + - ~ ! $ and true for every other char.  jsonnet_lex uses this to wind an
 * operator back until it no longer ends in one of those chars (or is a single char). */
bool allowed_at_end_of_operator(char c) {
175
20.6M
    switch (c) {
176
7.77M
        case '+':
177
8.33M
        case '-':
178
9.35M
        case '~':
179
10.4M
        case '!':
180
10.6M
        case '$': return false;
181
20.6M
    }
182
9.93M
    return true;
183
20.6M
}
184
185
/** Maps the spelling of each keyword to its token kind; consulted by lex_get_keyword_kind. */
static const std::map<std::string, Token::Kind> keywords = {
186
    {"assert", Token::ASSERT},
187
    {"else", Token::ELSE},
188
    {"error", Token::ERROR},
189
    {"false", Token::FALSE},
190
    {"for", Token::FOR},
191
    {"function", Token::FUNCTION},
192
    {"if", Token::IF},
193
    {"import", Token::IMPORT},
194
    {"importstr", Token::IMPORTSTR},
195
    {"importbin", Token::IMPORTBIN},
196
    {"in", Token::IN},
197
    {"local", Token::LOCAL},
198
    {"null", Token::NULL_LIT},
199
    {"self", Token::SELF},
200
    {"super", Token::SUPER},
201
    {"tailstrict", Token::TAILSTRICT},
202
    {"then", Token::THEN},
203
    {"true", Token::TRUE},
204
};
205
206
/** Return the token kind for an identifier-shaped word: the keyword's kind if the word is in the
 * keywords table, otherwise Token::IDENTIFIER.  The lookup is an exact string match. */
Token::Kind lex_get_keyword_kind(const std::string &identifier)
207
138M
{
208
138M
    auto it = keywords.find(identifier);
209
138M
    if (it == keywords.end())
210
101M
        return Token::IDENTIFIER;
211
36.9M
    return it->second;
212
138M
}
213
214
/** Lex a numeric literal starting at c.
 *
 * Advances c past the last char of the literal and returns the literal's text.  The literal is
 * recognised by a state machine: integer part, optional fraction, optional exponent.
 *
 * Throws StaticError (reported at filename / begin) if the first char is not a digit, or if a
 * decimal point, an 'e' / 'E', or an exponent sign is not followed by a digit.
 */
std::string lex_number(const char *&c, const std::string &filename, const Location &begin)
215
19.7M
{
216
    // This function should be understood with reference to the linked image:
217
    // https://www.json.org/img/number.png
218
219
    // Note, we deviate from the json.org documentation as follows:
220
    // There is no reason to lex negative numbers as atomic tokens, it is better to parse them
221
    // as a unary operator combined with a numeric literal.  This avoids x-1 being tokenized as
222
    // <identifier> <number> instead of the intended <identifier> <binop> <number>.
223
224
19.7M
    enum State {
225
19.7M
        BEGIN,
226
19.7M
        AFTER_ZERO,
227
19.7M
        AFTER_ONE_TO_NINE,
228
19.7M
        AFTER_DOT,
229
19.7M
        AFTER_DIGIT,
230
19.7M
        AFTER_E,
231
19.7M
        AFTER_EXP_SIGN,
232
19.7M
        AFTER_EXP_DIGIT
233
19.7M
    } state;
234
235
19.7M
    std::string r;
236
237
19.7M
    state = BEGIN;
238
45.4M
    while (true) {
239
45.4M
        switch (state) {
240
19.7M
            case BEGIN:  // Nothing consumed yet: the first char must be a digit.
241
19.7M
                switch (*c) {
242
8.02M
                    case '0': state = AFTER_ZERO; break;
243
244
4.34M
                    case '1':
245
5.20M
                    case '2':
246
6.00M
                    case '3':
247
7.27M
                    case '4':
248
7.50M
                    case '5':
249
8.34M
                    case '6':
250
8.68M
                    case '7':
251
10.4M
                    case '8':
252
11.6M
                    case '9': state = AFTER_ONE_TO_NINE; break;
253
254
0
                    default: throw StaticError(filename, begin, "couldn't lex number");
255
19.7M
                }
256
19.7M
                break;
257
258
19.7M
            case AFTER_ZERO:  // A leading 0 may only continue with '.', 'e' or 'E'.
259
8.02M
                switch (*c) {
260
47.9k
                    case '.': state = AFTER_DOT; break;
261
262
638
                    case 'e':
263
3.74k
                    case 'E': state = AFTER_E; break;
264
265
7.97M
                    default: goto end;
266
8.02M
                }
267
51.6k
                break;
268
269
14.6M
            case AFTER_ONE_TO_NINE:  // Inside an integer part that started with 1-9.
270
14.6M
                switch (*c) {
271
30.6k
                    case '.': state = AFTER_DOT; break;
272
273
2.46k
                    case 'e':
274
442k
                    case 'E': state = AFTER_E; break;
275
276
1.05M
                    case '0':
277
1.10M
                    case '1':
278
1.37M
                    case '2':
279
1.46M
                    case '3':
280
1.58M
                    case '4':
281
1.81M
                    case '5':
282
1.96M
                    case '6':
283
2.04M
                    case '7':
284
2.12M
                    case '8':
285
2.96M
                    case '9': state = AFTER_ONE_TO_NINE; break;
286
287
11.2M
                    default: goto end;
288
14.6M
                }
289
3.43M
                break;
290
291
3.43M
            case AFTER_DOT:  // A digit is required directly after the decimal point.
292
78.5k
                switch (*c) {
293
3.35k
                    case '0':
294
28.1k
                    case '1':
295
28.5k
                    case '2':
296
29.4k
                    case '3':
297
30.2k
                    case '4':
298
76.6k
                    case '5':
299
77.4k
                    case '6':
300
77.6k
                    case '7':
301
78.1k
                    case '8':
302
78.5k
                    case '9': state = AFTER_DIGIT; break;
303
304
27
                    default: {
305
27
                        std::stringstream ss;
306
27
                        ss << "couldn't lex number, junk after decimal point: " << *c;
307
27
                        throw StaticError(filename, begin, ss.str());
308
78.1k
                    }
309
78.5k
                }
310
78.5k
                break;
311
312
1.63M
            case AFTER_DIGIT:  // Inside the fraction digits.
313
1.63M
                switch (*c) {
314
2.37k
                    case 'e':
315
3.32k
                    case 'E': state = AFTER_E; break;
316
317
1.10M
                    case '0':
318
1.18M
                    case '1':
319
1.20M
                    case '2':
320
1.25M
                    case '3':
321
1.28M
                    case '4':
322
1.35M
                    case '5':
323
1.42M
                    case '6':
324
1.44M
                    case '7':
325
1.47M
                    case '8':
326
1.55M
                    case '9': state = AFTER_DIGIT; break;
327
328
75.2k
                    default: goto end;
329
1.63M
                }
330
1.55M
                break;
331
332
1.55M
            case AFTER_E:  // After 'e' / 'E': an optional sign, then a digit is required.
333
450k
                switch (*c) {
334
1.87k
                    case '+':
335
4.75k
                    case '-': state = AFTER_EXP_SIGN; break;
336
337
2.37k
                    case '0':
338
4.56k
                    case '1':
339
440k
                    case '2':
340
442k
                    case '3':
341
443k
                    case '4':
342
444k
                    case '5':
343
444k
                    case '6':
344
444k
                    case '7':
345
444k
                    case '8':
346
445k
                    case '9': state = AFTER_EXP_DIGIT; break;
347
348
82
                    default: {
349
82
                        std::stringstream ss;
350
82
                        ss << "couldn't lex number, junk after 'E': " << *c;
351
82
                        throw StaticError(filename, begin, ss.str());
352
444k
                    }
353
450k
                }
354
449k
                break;
355
356
449k
            case AFTER_EXP_SIGN:  // A digit is required directly after the exponent sign.
357
4.75k
                switch (*c) {
358
550
                    case '0':
359
892
                    case '1':
360
1.05k
                    case '2':
361
4.18k
                    case '3':
362
4.64k
                    case '4':
363
4.64k
                    case '5':
364
4.65k
                    case '6':
365
4.65k
                    case '7':
366
4.66k
                    case '8':
367
4.74k
                    case '9': state = AFTER_EXP_DIGIT; break;
368
369
16
                    default: {
370
16
                        std::stringstream ss;
371
16
                        ss << "couldn't lex number, junk after exponent sign: " << *c;
372
16
                        throw StaticError(filename, begin, ss.str());
373
4.66k
                    }
374
4.75k
                }
375
4.74k
                break;
376
377
905k
            case AFTER_EXP_DIGIT:  // Inside the exponent digits.
378
905k
                switch (*c) {
379
4.65k
                    case '0':
380
8.06k
                    case '1':
381
10.7k
                    case '2':
382
443k
                    case '3':
383
446k
                    case '4':
384
448k
                    case '5':
385
449k
                    case '6':
386
451k
                    case '7':
387
453k
                    case '8':
388
455k
                    case '9': state = AFTER_EXP_DIGIT; break;
389
390
449k
                    default: goto end;
391
905k
                }
392
455k
                break;
393
45.4M
        }
394
25.7M
        r += *c;  // The char was accepted by the state machine: append it and move on.
395
25.7M
        c++;
396
25.7M
    }
397
19.7M
end:
398
19.7M
    return r;
399
19.7M
}
400
401
// Check that b has at least the same whitespace prefix as a and returns the amount of this
402
// whitespace, otherwise returns 0.  If a has no whitespace prefix then return 0.
// Only ' ' and '\t' count as whitespace here, and the prefixes must match char for char.
403
static int whitespace_check(const char *a, const char *b)
404
40.3k
{
405
40.3k
    int i = 0;
406
651k
    while (a[i] == ' ' || a[i] == '\t') {
407
628k
        if (b[i] != a[i])
408
17.7k
            return 0;
409
610k
        i++;
410
610k
    }
411
22.5k
    return i;
412
40.3k
}
413
414
/*
415
static void add_whitespace(Fodder &fodder, const char *s, size_t n)
416
{
417
    std::string ws(s, n);
418
    if (fodder.size() == 0 || fodder.back().kind != FodderElement::WHITESPACE) {
419
        fodder.emplace_back(FodderElement::WHITESPACE, ws);
420
    } else {
421
        fodder.back().data += ws;
422
    }
423
}
424
*/
425
426
Tokens jsonnet_lex(const std::string &filename, const char *input)
427
46.3k
{
428
46.3k
    unsigned long line_number = 1;
429
46.3k
    const char *line_start = input;
430
431
46.3k
    Tokens r;
432
433
46.3k
    const char *c = input;
434
435
46.3k
    Fodder fodder;
436
46.3k
    bool fresh_line = true;  // Are we tokenizing from the beginning of a new line?
437
438
339M
    while (*c != '\0') {
439
        // Used to ensure we have actually advanced the pointer by the end of the iteration.
440
339M
        const char *original_c = c;
441
442
339M
        Token::Kind kind;
443
339M
        std::string data;
444
339M
        std::string string_block_indent;
445
339M
        std::string string_block_term_indent;
446
447
339M
        unsigned new_lines, indent;
448
339M
        lex_ws(c, new_lines, indent, line_start, line_number);
449
450
        // If it's the end of the file, discard final whitespace.
451
339M
        if (*c == '\0')
452
23.7k
            break;
453
454
339M
        if (new_lines > 0) {
455
            // Otherwise store whitespace in fodder.
456
34.6M
            unsigned blanks = new_lines - 1;
457
34.6M
            fodder.emplace_back(FodderElement::LINE_END, blanks, indent, EMPTY);
458
34.6M
            fresh_line = true;
459
34.6M
        }
460
461
339M
        Location begin(line_number, c - line_start + 1);
462
463
339M
        switch (*c) {
464
            // The following operators should never be combined with subsequent symbols.
465
1.43M
            case '{':
466
1.43M
                kind = Token::BRACE_L;
467
1.43M
                c++;
468
1.43M
                break;
469
470
1.41M
            case '}':
471
1.41M
                kind = Token::BRACE_R;
472
1.41M
                c++;
473
1.41M
                break;
474
475
7.34M
            case '[':
476
7.34M
                kind = Token::BRACKET_L;
477
7.34M
                c++;
478
7.34M
                break;
479
480
7.31M
            case ']':
481
7.31M
                kind = Token::BRACKET_R;
482
7.31M
                c++;
483
7.31M
                break;
484
485
30.8M
            case ',':
486
30.8M
                kind = Token::COMMA;
487
30.8M
                c++;
488
30.8M
                break;
489
490
16.5M
            case '.':
491
16.5M
                kind = Token::DOT;
492
16.5M
                c++;
493
16.5M
                break;
494
495
25.0M
            case '(':
496
25.0M
                kind = Token::PAREN_L;
497
25.0M
                c++;
498
25.0M
                break;
499
500
25.0M
            case ')':
501
25.0M
                kind = Token::PAREN_R;
502
25.0M
                c++;
503
25.0M
                break;
504
505
6.62M
            case ';':
506
6.62M
                kind = Token::SEMICOLON;
507
6.62M
                c++;
508
6.62M
                break;
509
510
            // Numeric literals.
511
8.02M
            case '0':
512
12.3M
            case '1':
513
13.2M
            case '2':
514
14.0M
            case '3':
515
15.3M
            case '4':
516
15.5M
            case '5':
517
16.3M
            case '6':
518
16.7M
            case '7':
519
18.5M
            case '8':
520
19.7M
            case '9':
521
19.7M
                kind = Token::NUMBER;
522
19.7M
                data = lex_number(c, filename, begin);
523
19.7M
                break;
524
525
            // UString literals.
526
320k
            case '"': {
527
320k
                c++;
528
82.2M
                for (;; ++c) {
529
82.2M
                    if (*c == '\0') {
530
88
                        throw StaticError(filename, begin, "unterminated string");
531
88
                    }
532
82.2M
                    if (*c == '"') {
533
320k
                        break;
534
320k
                    }
535
81.8M
                    if (*c == '\\' && *(c + 1) != '\0') {
536
199k
                        data += *c;
537
199k
                        ++c;
538
199k
                    }
539
81.8M
                    if (*c == '\n') {
540
                        // Maintain line/column counters.
541
7.53M
                        line_number++;
542
7.53M
                        line_start = c + 1;
543
7.53M
                    }
544
81.8M
                    data += *c;
545
81.8M
                }
546
320k
                c++;  // Advance beyond the ".
547
320k
                kind = Token::STRING_DOUBLE;
548
320k
            } break;
549
550
            // UString literals.
551
10.8M
            case '\'': {
552
10.8M
                c++;
553
142M
                for (;; ++c) {
554
142M
                    if (*c == '\0') {
555
82
                        throw StaticError(filename, begin, "unterminated string");
556
82
                    }
557
142M
                    if (*c == '\'') {
558
10.8M
                        break;
559
10.8M
                    }
560
131M
                    if (*c == '\\' && *(c + 1) != '\0') {
561
1.18M
                        data += *c;
562
1.18M
                        ++c;
563
1.18M
                    }
564
131M
                    if (*c == '\n') {
565
                        // Maintain line/column counters.
566
3.81M
                        line_number++;
567
3.81M
                        line_start = c + 1;
568
3.81M
                    }
569
131M
                    data += *c;
570
131M
                }
571
10.8M
                c++;  // Advance beyond the '.
572
10.8M
                kind = Token::STRING_SINGLE;
573
10.8M
            } break;
574
575
            // Verbatim string literals.
576
            // ' and " quoting is interpreted here, unlike non-verbatim strings
577
            // where it is done later by jsonnet_string_unescape.  This is OK
578
            // in this case because no information is lost by resolving the
579
            // repeated quote into a single quote, so we can go back to the
580
            // original form in the formatter.
581
10.6k
            case '@': {
582
10.6k
                c++;
583
10.6k
                if (*c != '"' && *c != '\'') {
584
37
                    std::stringstream ss;
585
37
                    ss << "couldn't lex verbatim string, junk after '@': " << *c;
586
37
                    throw StaticError(filename, begin, ss.str());
587
37
                }
588
10.6k
                const char quot = *c;
589
10.6k
                c++;  // Advance beyond the opening quote.
590
454k
                for (;; ++c) {
591
454k
                    if (*c == '\0') {
592
75
                        throw StaticError(filename, begin, "unterminated verbatim string");
593
75
                    }
594
454k
                    if (*c == quot) {
595
13.4k
                        if (*(c + 1) == quot) {
596
2.95k
                            c++;
597
10.5k
                        } else {
598
10.5k
                            break;
599
10.5k
                        }
600
13.4k
                    }
601
444k
                    data += *c;
602
444k
                }
603
10.5k
                c++;  // Advance beyond the closing quote.
604
10.5k
                if (quot == '"') {
605
7.62k
                    kind = Token::VERBATIM_STRING_DOUBLE;
606
7.62k
                } else {
607
2.92k
                    kind = Token::VERBATIM_STRING_SINGLE;
608
2.92k
                }
609
10.5k
            } break;
610
611
            // Keywords
612
186M
            default:
613
186M
                if (is_identifier_first(*c)) {
614
138M
                    std::string id;
615
760M
                    for (; is_identifier(*c); ++c)
616
622M
                        id += *c;
617
138M
                    kind = lex_get_keyword_kind(id);
618
138M
                    data = id;
619
620
138M
                } else if (is_symbol(*c) || *c == '#') {
621
                    // Single line C++ and Python style comments.
622
48.2M
                    if (*c == '#' || (*c == '/' && *(c + 1) == '/')) {
623
10.7M
                        std::vector<std::string> comment(1);
624
10.7M
                        unsigned blanks;
625
10.7M
                        unsigned indent;
626
10.7M
                        lex_until_newline(c, comment[0], blanks, indent, line_start, line_number);
627
10.7M
                        auto kind = fresh_line ? FodderElement::PARAGRAPH : FodderElement::LINE_END;
628
10.7M
                        fodder.emplace_back(kind, blanks, indent, comment);
629
10.7M
                        fresh_line = true;
630
10.7M
                        continue;  // We've not got a token, just fodder, so keep scanning.
631
10.7M
                    }
632
633
                    // Multi-line C style comment.
634
37.4M
                    if (*c == '/' && *(c + 1) == '*') {
635
1.79M
                        unsigned margin = c - line_start;
636
637
1.79M
                        const char *initial_c = c;
638
1.79M
                        c += 2;  // Avoid matching /*/: skip the /* before starting the search for
639
                                 // */.
640
641
135M
                        while (!(*c == '*' && *(c + 1) == '/')) {
642
133M
                            if (*c == '\0') {
643
222
                                auto msg = "multi-line comment has no terminating */.";
644
222
                                throw StaticError(filename, begin, msg);
645
222
                            }
646
133M
                            if (*c == '\n') {
647
                                // Just keep track of the line / column counters.
648
19.1M
                                line_number++;
649
19.1M
                                line_start = c + 1;
650
19.1M
                            }
651
133M
                            ++c;
652
133M
                        }
653
1.79M
                        c += 2;  // Move the pointer to the char after the closing '/'.
654
655
1.79M
                        std::string comment(initial_c,
656
1.79M
                                            c - initial_c);  // Includes the "/*" and "*/".
657
658
                        // Lex whitespace after comment
659
1.79M
                        unsigned new_lines_after, indent_after;
660
1.79M
                        lex_ws(c, new_lines_after, indent_after, line_start, line_number);
661
1.79M
                        std::vector<std::string> lines;
662
1.79M
                        if (comment.find('\n') >= comment.length()) {
663
                            // Comment looks like /* foo */
664
1.26M
                            lines.push_back(comment);
665
1.26M
                            fodder.emplace_back(FodderElement::INTERSTITIAL, 0, 0, lines);
666
1.26M
                            if (new_lines_after > 0) {
667
1.20M
                                fodder.emplace_back(FodderElement::LINE_END,
668
1.20M
                                                    new_lines_after - 1,
669
1.20M
                                                    indent_after,
670
1.20M
                                                    EMPTY);
671
1.20M
                                fresh_line = true;
672
1.20M
                            }
673
1.26M
                        } else {
674
524k
                            lines = line_split(comment, margin);
675
524k
                            assert(lines[0][0] == '/');
676
                            // Little hack to support PARAGRAPHs with * down the LHS:
677
                            // Add a space to lines that start with a '*'
678
524k
                            bool all_star = true;
679
19.6M
                            for (auto &l : lines) {
680
19.6M
                                if (l[0] != '*')
681
19.4M
                                    all_star = false;
682
19.6M
                            }
683
524k
                            if (all_star) {
684
0
                                for (auto &l : lines) {
685
0
                                    if (l[0] == '*')
686
0
                                        l = " " + l;
687
0
                                }
688
0
                            }
689
524k
                            if (new_lines_after == 0) {
690
                                // Ensure a line end after the paragraph.
691
26.9k
                                new_lines_after = 1;
692
26.9k
                                indent_after = 0;
693
26.9k
                            }
694
524k
                            fodder_push_back(fodder,
695
524k
                                             FodderElement(FodderElement::PARAGRAPH,
696
524k
                                                           new_lines_after - 1,
697
524k
                                                           indent_after,
698
524k
                                                           lines));
699
524k
                            fresh_line = true;
700
524k
                        }
701
1.79M
                        continue;  // We've not got a token, just fodder, so keep scanning.
702
1.79M
                    }
703
704
                    // Text block
705
35.6M
                    if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|') {
706
18.1k
                        c += 3;  // Skip the "|||".
707
708
18.1k
                        bool chomp_trailing_nl = false;
709
18.1k
                        if (*c == '-') {
710
1.07k
                            chomp_trailing_nl = true;
711
1.07k
                            c++;
712
1.07k
                        }
713
714
21.6k
                        while (is_horz_ws(*c)) ++c;  // Chomp whitespace at end of line.
715
18.1k
                        if (*c != '\n') {
716
112
                            auto msg = "text block syntax requires new line after |||.";
717
112
                            throw StaticError(filename, begin, msg);
718
112
                        }
719
18.0k
                        std::stringstream block;
720
18.0k
                        c++;  // Skip the "\n"
721
18.0k
                        line_number++;
722
                        // Skip any blank lines at the beginning of the block.
723
22.4k
                        while (*c == '\n') {
724
4.48k
                            line_number++;
725
4.48k
                            ++c;
726
4.48k
                            block << '\n';
727
4.48k
                        }
728
18.0k
                        line_start = c;
729
18.0k
                        const char *first_line = c;
730
18.0k
                        int ws_chars = whitespace_check(first_line, c);
731
18.0k
                        string_block_indent = std::string(first_line, ws_chars);
732
18.0k
                        if (ws_chars == 0) {
733
76
                            auto msg = "text block's first line must start with whitespace.";
734
76
                            throw StaticError(filename, begin, msg);
735
76
                        }
736
22.5k
                        while (true) {
737
22.5k
                            assert(ws_chars > 0);
738
                            // Read up to the \n
739
12.9M
                            for (c = &c[ws_chars]; *c != '\n'; ++c) {
740
12.9M
                                if (*c == '\0')
741
173
                                    throw StaticError(filename, begin, "unexpected EOF");
742
12.9M
                                block << *c;
743
12.9M
                            }
744
                            // Add the \n
745
22.3k
                            block << '\n';
746
22.3k
                            ++c;
747
22.3k
                            line_number++;
748
22.3k
                            line_start = c;
749
                            // Skip any blank lines
750
24.9k
                            while (*c == '\n') {
751
2.56k
                                line_number++;
752
2.56k
                                ++c;
753
2.56k
                                block << '\n';
754
2.56k
                            }
755
                            // Examine next line
756
22.3k
                            ws_chars = whitespace_check(first_line, c);
757
22.3k
                            if (ws_chars == 0) {
758
                                // End of text block
759
                                // Skip over any whitespace
760
73.2k
                                while (*c == ' ' || *c == '\t') {
761
55.4k
                                    string_block_term_indent += *c;
762
55.4k
                                    ++c;
763
55.4k
                                }
764
                                // Expect |||
765
17.7k
                                if (!(*c == '|' && *(c + 1) == '|' && *(c + 2) == '|')) {
766
229
                                    auto msg = "text block not terminated with |||";
767
229
                                    throw StaticError(filename, begin, msg);
768
229
                                }
769
17.5k
                                c += 3;  // Leave after the last |
770
17.5k
                                data = block.str();
771
17.5k
                                kind = Token::STRING_BLOCK;
772
17.5k
                                if (chomp_trailing_nl) {
773
1.05k
                                    assert(data.back() == '\n');
774
1.05k
                                    data.pop_back();
775
1.05k
                                }
776
17.5k
                                break;  // Out of the while loop.
777
17.5k
                            }
778
22.3k
                        }
779
780
17.5k
                        break;  // Out of the switch.
781
17.9k
                    }
782
783
35.6M
                    const char *operator_begin = c;
784
117M
                    for (; is_symbol(*c); ++c) {
785
                        // Not allowed // in operators
786
81.5M
                        if (*c == '/' && *(c + 1) == '/')
787
1.29k
                            break;
788
                        // Not allowed /* in operators
789
81.5M
                        if (*c == '/' && *(c + 1) == '*')
790
1.59k
                            break;
791
                        // Not allowed ||| in operators
792
81.5M
                        if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|')
793
2.03k
                            break;
794
81.5M
                    }
795
                    // Not allowed to end with a + - ~ ! unless a single char.
796
                    // So, wind it back if we need to (but not too far).
797
46.3M
                    while (c > operator_begin + 1 && !allowed_at_end_of_operator(*(c - 1))) {
798
10.6M
                        c--;
799
10.6M
                    }
800
35.6M
                    data += std::string(operator_begin, c);
801
35.6M
                    if (data == "$") {
802
107k
                        kind = Token::DOLLAR;
803
107k
                        data = "";
804
35.5M
                    } else {
805
35.5M
                        kind = Token::OPERATOR;
806
35.5M
                    }
807
35.6M
                } else {
808
411
                    std::stringstream ss;
809
411
                    ss << "Could not lex the character ";
810
411
                    auto uc = (unsigned char)(*c);
811
411
                    if (*c < 32)
812
374
                        ss << "code " << unsigned(uc);
813
37
                    else
814
37
                        ss << "'" << *c << "'";
815
411
                    throw StaticError(filename, begin, ss.str());
816
411
                }
817
339M
        }
818
819
        // Ensure that a bug in the above code does not cause an infinite memory consuming loop due
820
        // to pushing empty tokens.
821
326M
        if (c == original_c) {
822
0
            throw StaticError(filename, begin, "internal lexing error:  pointer did not advance");
823
0
        }
824
825
326M
        Location end(line_number, (c + 1) - line_start);
826
326M
        r.emplace_back(kind,
827
326M
                       fodder,
828
326M
                       data,
829
326M
                       string_block_indent,
830
326M
                       string_block_term_indent,
831
326M
                       LocationRange(filename, begin, end));
832
326M
        fodder.clear();
833
326M
        fresh_line = false;
834
326M
    }
835
836
44.7k
    Location begin(line_number, c - line_start + 1);
837
44.7k
    Location end(line_number, (c + 1) - line_start + 1);
838
44.7k
    r.emplace_back(Token::END_OF_FILE, fodder, "", "", "", LocationRange(filename, begin, end));
839
44.7k
    return r;
840
46.3k
}
841
842
std::string jsonnet_unlex(const Tokens &tokens)
843
0
{
844
0
    std::stringstream ss;
845
0
    for (const auto &t : tokens) {
846
0
        for (const auto &f : t.fodder) {
847
0
            switch (f.kind) {
848
0
                case FodderElement::LINE_END: {
849
0
                    if (f.comment.size() > 0) {
850
0
                        ss << "LineEnd(" << f.blanks << ", " << f.indent << ", " << f.comment[0]
851
0
                           << ")\n";
852
0
                    } else {
853
0
                        ss << "LineEnd(" << f.blanks << ", " << f.indent << ")\n";
854
0
                    }
855
0
                } break;
856
857
0
                case FodderElement::INTERSTITIAL: {
858
0
                    ss << "Interstitial(" << f.comment[0] << ")\n";
859
0
                } break;
860
861
0
                case FodderElement::PARAGRAPH: {
862
0
                    ss << "Paragraph(\n";
863
0
                    for (const auto &line : f.comment) {
864
0
                        ss << "    " << line << '\n';
865
0
                    }
866
0
                    ss << ")" << f.blanks << "\n";
867
0
                } break;
868
0
            }
869
0
        }
870
0
        if (t.kind == Token::END_OF_FILE) {
871
0
            ss << "EOF\n";
872
0
            break;
873
0
        }
874
0
        if (t.kind == Token::STRING_DOUBLE) {
875
0
            ss << "\"" << t.data << "\"\n";
876
0
        } else if (t.kind == Token::STRING_SINGLE) {
877
0
            ss << "'" << t.data << "'\n";
878
0
        } else if (t.kind == Token::STRING_BLOCK) {
879
0
            ss << "|||\n";
880
0
            ss << t.stringBlockIndent;
881
0
            for (const char *cp = t.data.c_str(); *cp != '\0'; ++cp) {
882
0
                ss << *cp;
883
0
                if (*cp == '\n' && *(cp + 1) != '\n' && *(cp + 1) != '\0') {
884
0
                    ss << t.stringBlockIndent;
885
0
                }
886
0
            }
887
0
            ss << t.stringBlockTermIndent << "|||\n";
888
0
        } else {
889
0
            ss << t.data << "\n";
890
0
        }
891
0
    }
892
0
    return ss.str();
893
0
}
894
895
}  // namespace jsonnet::internal