/src/jsonnet/core/lexer.cpp
Line | Count | Source |
1 | | /* |
2 | | Copyright 2015 Google Inc. All rights reserved. |
3 | | |
4 | | Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | you may not use this file except in compliance with the License. |
6 | | You may obtain a copy of the License at |
7 | | |
8 | | http://www.apache.org/licenses/LICENSE-2.0 |
9 | | |
10 | | Unless required by applicable law or agreed to in writing, software |
11 | | distributed under the License is distributed on an "AS IS" BASIS, |
12 | | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | See the License for the specific language governing permissions and |
14 | | limitations under the License. |
15 | | */ |
16 | | |
17 | | #include <cassert> |
18 | | |
19 | | #include <map> |
20 | | #include <sstream> |
21 | | #include <string> |
22 | | |
23 | | #include "lexer.h" |
24 | | #include "static_error.h" |
25 | | #include "unicode.h" |
26 | | |
27 | | namespace jsonnet::internal { |
28 | | |
/** Shared empty comment list, passed when creating fodder elements that carry no comment text. */
static const std::vector<std::string> EMPTY;
30 | | |
/** Report whether c is horizontal whitespace: a space, a tab or a carriage return (never \n). */
static bool is_horz_ws(char c)
{
    switch (c) {
        case ' ':
        case '\t':
        case '\r': return true;
        default: return false;
    }
}
36 | | |
37 | | /** Is the char whitespace. */ |
38 | | static bool is_ws(char c) |
39 | 825M | { |
40 | 825M | return c == '\n' || is_horz_ws(c); |
41 | 825M | } |
42 | | |
43 | | /** Strip whitespace from both ends of a string, but only up to margin on the left hand side. */ |
44 | | static std::string strip_ws(const std::string &s, unsigned margin) |
45 | 18.6M | { |
46 | 18.6M | if (s.size() == 0) |
47 | 14.7M | return s; // Avoid underflow below. |
48 | 3.88M | size_t i = 0; |
49 | 9.70M | while (i < s.length() && is_horz_ws(s[i]) && i < margin) |
50 | 5.81M | i++; |
51 | 3.88M | size_t j = s.size(); |
52 | 7.33M | while (j > i && is_horz_ws(s[j - 1])) { |
53 | 3.44M | j--; |
54 | 3.44M | } |
55 | 3.88M | return std::string(&s[i], &s[j]); |
56 | 18.6M | } |
57 | | |
58 | | /** Split a string by \n and also strip left (up to margin) & right whitespace from each line. */ |
59 | | static std::vector<std::string> line_split(const std::string &s, unsigned margin) |
60 | 428k | { |
61 | 428k | std::vector<std::string> ret; |
62 | 428k | std::stringstream ss; |
63 | 130M | for (size_t i = 0; i < s.length(); ++i) { |
64 | 129M | if (s[i] == '\n') { |
65 | 18.2M | ret.emplace_back(strip_ws(ss.str(), margin)); |
66 | 18.2M | ss.str(""); |
67 | 111M | } else { |
68 | 111M | ss << s[i]; |
69 | 111M | } |
70 | 129M | } |
71 | 428k | ret.emplace_back(strip_ws(ss.str(), margin)); |
72 | 428k | return ret; |
73 | 428k | } |
74 | | |
/** Skip over a run of whitespace, advancing c to the first non-whitespace character (or the NUL).
 *
 * On return new_lines holds the number of \n consumed and indent holds the width of the
 * whitespace seen since the last \n (a space counts 1, a tab counts 8, \r counts nothing).
 * line_start and line_number are updated for every \n consumed.
 */
static void lex_ws(const char *&c, unsigned &new_lines, unsigned &indent, const char *&line_start,
                   unsigned long &line_number)
{
    indent = 0;
    new_lines = 0;
    for (;; ++c) {
        switch (*c) {
            case '\r':
                // Carriage returns are skipped without affecting the indent.
                break;

            case '\n':
                indent = 0;
                ++new_lines;
                ++line_number;
                line_start = c + 1;
                break;

            case ' ': ++indent; break;

            // Counting a tab as 8 is only accurate for tabs at the very start of a line, but
            // whitespace elsewhere is stripped anyway. Spaces followed by a tab at the start of a
            // line are the one problematic case; that is rare, ill-advised, and fixed later if
            // re-indentation is enabled.
            case '\t': indent += 8; break;

            // Anything else, including the terminating NUL, ends the whitespace run.
            default: return;
        }
    }
}
107 | | |
108 | | /** |
109 | | # Consume all text until the end of the line, return number of newlines after that and indent |
110 | | */ |
111 | | static void lex_until_newline(const char *&c, std::string &text, unsigned &blanks, unsigned &indent, |
112 | | const char *&line_start, unsigned long &line_number) |
113 | 11.2M | { |
114 | 11.2M | const char *original_c = c; |
115 | 11.2M | const char *last_non_space = c; |
116 | 138M | for (; *c != '\0' && *c != '\n'; c++) { |
117 | 127M | if (!is_horz_ws(*c)) |
118 | 109M | last_non_space = c; |
119 | 127M | } |
120 | 11.2M | text = std::string(original_c, last_non_space - original_c + 1); |
121 | | // Consume subsequent whitespace including the '\n'. |
122 | 11.2M | unsigned new_lines; |
123 | 11.2M | lex_ws(c, new_lines, indent, line_start, line_number); |
124 | 11.2M | blanks = new_lines == 0 ? 0 : new_lines - 1; |
125 | 11.2M | } |
126 | | |
/** Report whether c is an ASCII upper-case letter. */
static bool is_upper(char c)
{
    return 'A' <= c && c <= 'Z';
}
131 | | |
/** Report whether c is an ASCII lower-case letter. */
static bool is_lower(char c)
{
    return 'a' <= c && c <= 'z';
}
136 | | |
/** Report whether c is an ASCII decimal digit. */
static bool is_number(char c)
{
    return '0' <= c && c <= '9';
}
141 | | |
142 | | static bool is_identifier_first(char c) |
143 | 983M | { |
144 | 983M | return is_upper(c) || is_lower(c) || c == '_'; |
145 | 983M | } |
146 | | |
147 | | static bool is_identifier(char c) |
148 | 789M | { |
149 | 789M | return is_identifier_first(c) || is_number(c); |
150 | 789M | } |
151 | | |
/** Report whether c is one of the characters that can form part of an operator token. */
static bool is_symbol(char c)
{
    switch (c) {
        // Arithmetic.
        case '+':
        case '-':
        case '*':
        case '/':
        case '%':
        // Bitwise and logical.
        case '&':
        case '|':
        case '^':
        case '~':
        case '!':
        // Comparison and assignment.
        case '=':
        case '<':
        case '>':
        // Miscellaneous.
        case ':':
        case '$': return true;

        default: return false;
    }
}
173 | | |
/** Report whether c may be the final character of a multi-character operator.
 *
 * The characters + - ~ ! $ are rejected; every other character is accepted.
 */
bool allowed_at_end_of_operator(char c)
{
    const bool rejected = (c == '+' || c == '-' || c == '~' || c == '!' || c == '$');
    return !rejected;
}
184 | | |
/** Reserved words and the token kind each one lexes to; consulted by lex_get_keyword_kind. */
static const std::map<std::string, Token::Kind> keywords = {
    {"assert", Token::ASSERT},
    {"else", Token::ELSE},
    {"error", Token::ERROR},
    {"false", Token::FALSE},
    {"for", Token::FOR},
    {"function", Token::FUNCTION},
    {"if", Token::IF},
    {"import", Token::IMPORT},
    {"importstr", Token::IMPORTSTR},
    {"importbin", Token::IMPORTBIN},
    {"in", Token::IN},
    {"local", Token::LOCAL},
    {"null", Token::NULL_LIT},
    {"self", Token::SELF},
    {"super", Token::SUPER},
    {"tailstrict", Token::TAILSTRICT},
    {"then", Token::THEN},
    {"true", Token::TRUE},
};
205 | | |
206 | | Token::Kind lex_get_keyword_kind(const std::string &identifier) |
207 | 143M | { |
208 | 143M | auto it = keywords.find(identifier); |
209 | 143M | if (it == keywords.end()) |
210 | 105M | return Token::IDENTIFIER; |
211 | 38.2M | return it->second; |
212 | 143M | } |
213 | | |
214 | | std::string lex_number(const char *&c, const std::string &filename, const Location &begin) |
215 | 19.7M | { |
216 | | // This function should be understood with reference to the linked image: |
217 | | // https://www.json.org/img/number.png |
218 | | |
219 | | // Note, we deviate from the json.org documentation as follows: |
220 | | // There is no reason to lex negative numbers as atomic tokens, it is better to parse them |
221 | | // as a unary operator combined with a numeric literal. This avoids x-1 being tokenized as |
222 | | // <identifier> <number> instead of the intended <identifier> <binop> <number>. |
223 | | |
224 | 19.7M | enum State { |
225 | 19.7M | BEGIN, |
226 | 19.7M | AFTER_ZERO, |
227 | 19.7M | AFTER_ONE_TO_NINE, |
228 | 19.7M | AFTER_DOT, |
229 | 19.7M | AFTER_DIGIT, |
230 | 19.7M | AFTER_E, |
231 | 19.7M | AFTER_EXP_SIGN, |
232 | 19.7M | AFTER_EXP_DIGIT |
233 | 19.7M | } state; |
234 | | |
235 | 19.7M | std::string r; |
236 | | |
237 | 19.7M | state = BEGIN; |
238 | 45.0M | while (true) { |
239 | 45.0M | switch (state) { |
240 | 19.7M | case BEGIN: |
241 | 19.7M | switch (*c) { |
242 | 8.28M | case '0': state = AFTER_ZERO; break; |
243 | | |
244 | 4.46M | case '1': |
245 | 5.34M | case '2': |
246 | 6.18M | case '3': |
247 | 7.25M | case '4': |
248 | 7.47M | case '5': |
249 | 8.31M | case '6': |
250 | 8.73M | case '7': |
251 | 10.2M | case '8': |
252 | 11.4M | case '9': state = AFTER_ONE_TO_NINE; break; |
253 | | |
254 | 0 | default: throw StaticError(filename, begin, "couldn't lex number"); |
255 | 19.7M | } |
256 | 19.7M | break; |
257 | | |
258 | 19.7M | case AFTER_ZERO: |
259 | 8.28M | switch (*c) { |
260 | 49.6k | case '.': state = AFTER_DOT; break; |
261 | | |
262 | 2.35k | case 'e': |
263 | 5.66k | case 'E': state = AFTER_E; break; |
264 | | |
265 | 8.23M | default: goto end; |
266 | 8.28M | } |
267 | 55.3k | break; |
268 | | |
269 | 14.4M | case AFTER_ONE_TO_NINE: |
270 | 14.4M | switch (*c) { |
271 | 31.5k | case '.': state = AFTER_DOT; break; |
272 | | |
273 | 2.47k | case 'e': |
274 | 275k | case 'E': state = AFTER_E; break; |
275 | | |
276 | 1.07M | case '0': |
277 | 1.12M | case '1': |
278 | 1.39M | case '2': |
279 | 1.49M | case '3': |
280 | 1.60M | case '4': |
281 | 1.84M | case '5': |
282 | 2.00M | case '6': |
283 | 2.07M | case '7': |
284 | 2.15M | case '8': |
285 | 2.99M | case '9': state = AFTER_ONE_TO_NINE; break; |
286 | | |
287 | 11.1M | default: goto end; |
288 | 14.4M | } |
289 | 3.29M | break; |
290 | | |
291 | 3.29M | case AFTER_DOT: |
292 | 81.2k | switch (*c) { |
293 | 3.46k | case '0': |
294 | 28.7k | case '1': |
295 | 29.2k | case '2': |
296 | 30.1k | case '3': |
297 | 31.0k | case '4': |
298 | 79.2k | case '5': |
299 | 80.1k | case '6': |
300 | 80.1k | case '7': |
301 | 80.6k | case '8': |
302 | 81.1k | case '9': state = AFTER_DIGIT; break; |
303 | | |
304 | 29 | default: { |
305 | 29 | std::stringstream ss; |
306 | 29 | ss << "couldn't lex number, junk after decimal point: " << *c; |
307 | 29 | throw StaticError(filename, begin, ss.str()); |
308 | 80.6k | } |
309 | 81.2k | } |
310 | 81.1k | break; |
311 | | |
312 | 1.65M | case AFTER_DIGIT: |
313 | 1.65M | switch (*c) { |
314 | 2.57k | case 'e': |
315 | 3.82k | case 'E': state = AFTER_E; break; |
316 | | |
317 | 1.10M | case '0': |
318 | 1.18M | case '1': |
319 | 1.21M | case '2': |
320 | 1.26M | case '3': |
321 | 1.29M | case '4': |
322 | 1.37M | case '5': |
323 | 1.43M | case '6': |
324 | 1.46M | case '7': |
325 | 1.49M | case '8': |
326 | 1.57M | case '9': state = AFTER_DIGIT; break; |
327 | | |
328 | 77.3k | default: goto end; |
329 | 1.65M | } |
330 | 1.57M | break; |
331 | | |
332 | 1.57M | case AFTER_E: |
333 | 284k | switch (*c) { |
334 | 1.69k | case '+': |
335 | 4.49k | case '-': state = AFTER_EXP_SIGN; break; |
336 | | |
337 | 3.01k | case '0': |
338 | 6.54k | case '1': |
339 | 275k | case '2': |
340 | 277k | case '3': |
341 | 278k | case '4': |
342 | 279k | case '5': |
343 | 279k | case '6': |
344 | 279k | case '7': |
345 | 279k | case '8': |
346 | 280k | case '9': state = AFTER_EXP_DIGIT; break; |
347 | | |
348 | 82 | default: { |
349 | 82 | std::stringstream ss; |
350 | 82 | ss << "couldn't lex number, junk after 'E': " << *c; |
351 | 82 | throw StaticError(filename, begin, ss.str()); |
352 | 279k | } |
353 | 284k | } |
354 | 284k | break; |
355 | | |
356 | 284k | case AFTER_EXP_SIGN: |
357 | 4.49k | switch (*c) { |
358 | 612 | case '0': |
359 | 883 | case '1': |
360 | 982 | case '2': |
361 | 3.98k | case '3': |
362 | 4.37k | case '4': |
363 | 4.38k | case '5': |
364 | 4.38k | case '6': |
365 | 4.39k | case '7': |
366 | 4.40k | case '8': |
367 | 4.47k | case '9': state = AFTER_EXP_DIGIT; break; |
368 | | |
369 | 17 | default: { |
370 | 17 | std::stringstream ss; |
371 | 17 | ss << "couldn't lex number, junk after exponent sign: " << *c; |
372 | 17 | throw StaticError(filename, begin, ss.str()); |
373 | 4.40k | } |
374 | 4.49k | } |
375 | 4.47k | break; |
376 | | |
377 | 573k | case AFTER_EXP_DIGIT: |
378 | 573k | switch (*c) { |
379 | 4.89k | case '0': |
380 | 7.28k | case '1': |
381 | 9.80k | case '2': |
382 | 277k | case '3': |
383 | 279k | case '4': |
384 | 281k | case '5': |
385 | 282k | case '6': |
386 | 284k | case '7': |
387 | 286k | case '8': |
388 | 288k | case '9': state = AFTER_EXP_DIGIT; break; |
389 | | |
390 | 284k | default: goto end; |
391 | 573k | } |
392 | 288k | break; |
393 | 45.0M | } |
394 | 25.3M | r += *c; |
395 | 25.3M | c++; |
396 | 25.3M | } |
397 | 19.7M | end: |
398 | 19.7M | return r; |
399 | 19.7M | } |
400 | | |
// Measure the leading run of spaces/tabs in a and verify that b starts with exactly the same
// characters. Returns the length of that run when b matches it in full; returns 0 if b diverges
// anywhere inside the run, or if a has no leading whitespace at all.
static int whitespace_check(const char *a, const char *b)
{
    int matched = 0;
    for (; a[matched] == ' ' || a[matched] == '\t'; ++matched) {
        if (a[matched] != b[matched])
            return 0;
    }
    return matched;
}
413 | | |
414 | | /* |
415 | | static void add_whitespace(Fodder &fodder, const char *s, size_t n) |
416 | | { |
417 | | std::string ws(s, n); |
418 | | if (fodder.size() == 0 || fodder.back().kind != FodderElement::WHITESPACE) { |
419 | | fodder.emplace_back(FodderElement::WHITESPACE, ws); |
420 | | } else { |
421 | | fodder.back().data += ws; |
422 | | } |
423 | | } |
424 | | */ |
425 | | |
426 | | Tokens jsonnet_lex(const std::string &filename, const char *input) |
427 | 47.8k | { |
428 | 47.8k | unsigned long line_number = 1; |
429 | 47.8k | const char *line_start = input; |
430 | | |
431 | 47.8k | Tokens r; |
432 | | |
433 | 47.8k | const char *c = input; |
434 | | |
435 | 47.8k | Fodder fodder; |
436 | 47.8k | bool fresh_line = true; // Are we tokenizing from the beginning of a new line? |
437 | | |
438 | 350M | while (*c != '\0') { |
439 | | // Used to ensure we have actually advanced the pointer by the end of the iteration. |
440 | 350M | const char *original_c = c; |
441 | | |
442 | 350M | Token::Kind kind; |
443 | 350M | std::string data; |
444 | 350M | std::string string_block_indent; |
445 | 350M | std::string string_block_term_indent; |
446 | | |
447 | 350M | unsigned new_lines, indent; |
448 | 350M | lex_ws(c, new_lines, indent, line_start, line_number); |
449 | | |
450 | | // If it's the end of the file, discard final whitespace. |
451 | 350M | if (*c == '\0') |
452 | 24.6k | break; |
453 | | |
454 | 350M | if (new_lines > 0) { |
455 | | // Otherwise store whitespace in fodder. |
456 | 35.8M | unsigned blanks = new_lines - 1; |
457 | 35.8M | fodder.emplace_back(FodderElement::LINE_END, blanks, indent, EMPTY); |
458 | 35.8M | fresh_line = true; |
459 | 35.8M | } |
460 | | |
461 | 350M | Location begin(line_number, c - line_start + 1); |
462 | | |
463 | 350M | switch (*c) { |
464 | | // The following operators should never be combined with subsequent symbols. |
465 | 1.48M | case '{': |
466 | 1.48M | kind = Token::BRACE_L; |
467 | 1.48M | c++; |
468 | 1.48M | break; |
469 | | |
470 | 1.46M | case '}': |
471 | 1.46M | kind = Token::BRACE_R; |
472 | 1.46M | c++; |
473 | 1.46M | break; |
474 | | |
475 | 7.63M | case '[': |
476 | 7.63M | kind = Token::BRACKET_L; |
477 | 7.63M | c++; |
478 | 7.63M | break; |
479 | | |
480 | 7.59M | case ']': |
481 | 7.59M | kind = Token::BRACKET_R; |
482 | 7.59M | c++; |
483 | 7.59M | break; |
484 | | |
485 | 31.2M | case ',': |
486 | 31.2M | kind = Token::COMMA; |
487 | 31.2M | c++; |
488 | 31.2M | break; |
489 | | |
490 | 17.2M | case '.': |
491 | 17.2M | kind = Token::DOT; |
492 | 17.2M | c++; |
493 | 17.2M | break; |
494 | | |
495 | 25.9M | case '(': |
496 | 25.9M | kind = Token::PAREN_L; |
497 | 25.9M | c++; |
498 | 25.9M | break; |
499 | | |
500 | 25.9M | case ')': |
501 | 25.9M | kind = Token::PAREN_R; |
502 | 25.9M | c++; |
503 | 25.9M | break; |
504 | | |
505 | 6.76M | case ';': |
506 | 6.76M | kind = Token::SEMICOLON; |
507 | 6.76M | c++; |
508 | 6.76M | break; |
509 | | |
510 | | // Numeric literals. |
511 | 8.28M | case '0': |
512 | 12.7M | case '1': |
513 | 13.6M | case '2': |
514 | 14.4M | case '3': |
515 | 15.5M | case '4': |
516 | 15.7M | case '5': |
517 | 16.5M | case '6': |
518 | 17.0M | case '7': |
519 | 18.5M | case '8': |
520 | 19.7M | case '9': |
521 | 19.7M | kind = Token::NUMBER; |
522 | 19.7M | data = lex_number(c, filename, begin); |
523 | 19.7M | break; |
524 | | |
525 | | // UString literals. |
526 | 338k | case '"': { |
527 | 338k | c++; |
528 | 83.4M | for (;; ++c) { |
529 | 83.4M | if (*c == '\0') { |
530 | 95 | throw StaticError(filename, begin, "unterminated string"); |
531 | 95 | } |
532 | 83.4M | if (*c == '"') { |
533 | 338k | break; |
534 | 338k | } |
535 | 83.0M | if (*c == '\\' && *(c + 1) != '\0') { |
536 | 223k | data += *c; |
537 | 223k | ++c; |
538 | 223k | } |
539 | 83.0M | if (*c == '\n') { |
540 | | // Maintain line/column counters. |
541 | 6.83M | line_number++; |
542 | 6.83M | line_start = c + 1; |
543 | 6.83M | } |
544 | 83.0M | data += *c; |
545 | 83.0M | } |
546 | 338k | c++; // Advance beyond the ". |
547 | 338k | kind = Token::STRING_DOUBLE; |
548 | 338k | } break; |
549 | | |
550 | | // UString literals. |
551 | 11.2M | case '\'': { |
552 | 11.2M | c++; |
553 | 150M | for (;; ++c) { |
554 | 150M | if (*c == '\0') { |
555 | 75 | throw StaticError(filename, begin, "unterminated string"); |
556 | 75 | } |
557 | 150M | if (*c == '\'') { |
558 | 11.2M | break; |
559 | 11.2M | } |
560 | 138M | if (*c == '\\' && *(c + 1) != '\0') { |
561 | 1.22M | data += *c; |
562 | 1.22M | ++c; |
563 | 1.22M | } |
564 | 138M | if (*c == '\n') { |
565 | | // Maintain line/column counters. |
566 | 4.77M | line_number++; |
567 | 4.77M | line_start = c + 1; |
568 | 4.77M | } |
569 | 138M | data += *c; |
570 | 138M | } |
571 | 11.2M | c++; // Advance beyond the '. |
572 | 11.2M | kind = Token::STRING_SINGLE; |
573 | 11.2M | } break; |
574 | | |
575 | | // Verbatim string literals. |
576 | | // ' and " quoting is interpreted here, unlike non-verbatim strings |
577 | | // where it is done later by jsonnet_string_unescape. This is OK |
578 | | // in this case because no information is lost by resoving the |
579 | | // repeated quote into a single quote, so we can go back to the |
580 | | // original form in the formatter. |
581 | 10.8k | case '@': { |
582 | 10.8k | c++; |
583 | 10.8k | if (*c != '"' && *c != '\'') { |
584 | 38 | std::stringstream ss; |
585 | 38 | ss << "couldn't lex verbatim string, junk after '@': " << *c; |
586 | 38 | throw StaticError(filename, begin, ss.str()); |
587 | 38 | } |
588 | 10.8k | const char quot = *c; |
589 | 10.8k | c++; // Advance beyond the opening quote. |
590 | 420k | for (;; ++c) { |
591 | 420k | if (*c == '\0') { |
592 | 74 | throw StaticError(filename, begin, "unterminated verbatim string"); |
593 | 74 | } |
594 | 420k | if (*c == quot) { |
595 | 13.5k | if (*(c + 1) == quot) { |
596 | 2.75k | c++; |
597 | 10.7k | } else { |
598 | 10.7k | break; |
599 | 10.7k | } |
600 | 13.5k | } |
601 | 409k | data += *c; |
602 | 409k | } |
603 | 10.7k | c++; // Advance beyond the closing quote. |
604 | 10.7k | if (quot == '"') { |
605 | 7.49k | kind = Token::VERBATIM_STRING_DOUBLE; |
606 | 7.49k | } else { |
607 | 3.28k | kind = Token::VERBATIM_STRING_SINGLE; |
608 | 3.28k | } |
609 | 10.7k | } break; |
610 | | |
611 | | // Keywords |
612 | 193M | default: |
613 | 193M | if (is_identifier_first(*c)) { |
614 | 143M | std::string id; |
615 | 789M | for (; is_identifier(*c); ++c) |
616 | 646M | id += *c; |
617 | 143M | kind = lex_get_keyword_kind(id); |
618 | 143M | data = id; |
619 | | |
620 | 143M | } else if (is_symbol(*c) || *c == '#') { |
621 | | // Single line C++ and Python style comments. |
622 | 49.6M | if (*c == '#' || (*c == '/' && *(c + 1) == '/')) { |
623 | 11.2M | std::vector<std::string> comment(1); |
624 | 11.2M | unsigned blanks; |
625 | 11.2M | unsigned indent; |
626 | 11.2M | lex_until_newline(c, comment[0], blanks, indent, line_start, line_number); |
627 | 11.2M | auto kind = fresh_line ? FodderElement::PARAGRAPH : FodderElement::LINE_END; |
628 | 11.2M | fodder.emplace_back(kind, blanks, indent, comment); |
629 | 11.2M | fresh_line = true; |
630 | 11.2M | continue; // We've not got a token, just fodder, so keep scanning. |
631 | 11.2M | } |
632 | | |
633 | | // Multi-line C style comment. |
634 | 38.3M | if (*c == '/' && *(c + 1) == '*') { |
635 | 1.42M | unsigned margin = c - line_start; |
636 | | |
637 | 1.42M | const char *initial_c = c; |
638 | 1.42M | c += 2; // Avoid matching /*/: skip the /* before starting the search for |
639 | | // */. |
640 | | |
641 | 135M | while (!(*c == '*' && *(c + 1) == '/')) { |
642 | 134M | if (*c == '\0') { |
643 | 211 | auto msg = "multi-line comment has no terminating */."; |
644 | 211 | throw StaticError(filename, begin, msg); |
645 | 211 | } |
646 | 134M | if (*c == '\n') { |
647 | | // Just keep track of the line / column counters. |
648 | 18.2M | line_number++; |
649 | 18.2M | line_start = c + 1; |
650 | 18.2M | } |
651 | 134M | ++c; |
652 | 134M | } |
653 | 1.42M | c += 2; // Move the pointer to the char after the closing '/'. |
654 | | |
655 | 1.42M | std::string comment(initial_c, |
656 | 1.42M | c - initial_c); // Includes the "/*" and "*/". |
657 | | |
658 | | // Lex whitespace after comment |
659 | 1.42M | unsigned new_lines_after, indent_after; |
660 | 1.42M | lex_ws(c, new_lines_after, indent_after, line_start, line_number); |
661 | 1.42M | std::vector<std::string> lines; |
662 | 1.42M | if (comment.find('\n') >= comment.length()) { |
663 | | // Comment looks like /* foo */ |
664 | 998k | lines.push_back(comment); |
665 | 998k | fodder.emplace_back(FodderElement::INTERSTITIAL, 0, 0, lines); |
666 | 998k | if (new_lines_after > 0) { |
667 | 937k | fodder.emplace_back(FodderElement::LINE_END, |
668 | 937k | new_lines_after - 1, |
669 | 937k | indent_after, |
670 | 937k | EMPTY); |
671 | 937k | fresh_line = true; |
672 | 937k | } |
673 | 998k | } else { |
674 | 428k | lines = line_split(comment, margin); |
675 | 428k | assert(lines[0][0] == '/'); |
676 | | // Little hack to support PARAGRAPHs with * down the LHS: |
677 | | // Add a space to lines that start with a '*' |
678 | 428k | bool all_star = true; |
679 | 18.6M | for (auto &l : lines) { |
680 | 18.6M | if (l[0] != '*') |
681 | 18.4M | all_star = false; |
682 | 18.6M | } |
683 | 428k | if (all_star) { |
684 | 0 | for (auto &l : lines) { |
685 | 0 | if (l[0] == '*') |
686 | 0 | l = " " + l; |
687 | 0 | } |
688 | 0 | } |
689 | 428k | if (new_lines_after == 0) { |
690 | | // Ensure a line end after the paragraph. |
691 | 27.0k | new_lines_after = 1; |
692 | 27.0k | indent_after = 0; |
693 | 27.0k | } |
694 | 428k | fodder_push_back(fodder, |
695 | 428k | FodderElement(FodderElement::PARAGRAPH, |
696 | 428k | new_lines_after - 1, |
697 | 428k | indent_after, |
698 | 428k | lines)); |
699 | 428k | fresh_line = true; |
700 | 428k | } |
701 | 1.42M | continue; // We've not got a token, just fodder, so keep scanning. |
702 | 1.42M | } |
703 | | |
704 | | // Text block |
705 | 36.9M | if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|') { |
706 | 17.6k | c += 3; // Skip the "|||". |
707 | | |
708 | 17.6k | bool chomp_trailing_nl = false; |
709 | 17.6k | if (*c == '-') { |
710 | 1.04k | chomp_trailing_nl = true; |
711 | 1.04k | c++; |
712 | 1.04k | } |
713 | | |
714 | 21.2k | while (is_horz_ws(*c)) ++c; // Chomp whitespace at end of line. |
715 | 17.6k | if (*c != '\n') { |
716 | 111 | auto msg = "text block syntax requires new line after |||."; |
717 | 111 | throw StaticError(filename, begin, msg); |
718 | 111 | } |
719 | 17.5k | std::stringstream block; |
720 | 17.5k | c++; // Skip the "\n" |
721 | 17.5k | line_number++; |
722 | | // Skip any blank lines at the beginning of the block. |
723 | 22.7k | while (*c == '\n') { |
724 | 5.21k | line_number++; |
725 | 5.21k | ++c; |
726 | 5.21k | block << '\n'; |
727 | 5.21k | } |
728 | 17.5k | line_start = c; |
729 | 17.5k | const char *first_line = c; |
730 | 17.5k | int ws_chars = whitespace_check(first_line, c); |
731 | 17.5k | string_block_indent = std::string(first_line, ws_chars); |
732 | 17.5k | if (ws_chars == 0) { |
733 | 83 | auto msg = "text block's first line must start with whitespace."; |
734 | 83 | throw StaticError(filename, begin, msg); |
735 | 83 | } |
736 | 22.4k | while (true) { |
737 | 22.4k | assert(ws_chars > 0); |
738 | | // Read up to the \n |
739 | 14.0M | for (c = &c[ws_chars]; *c != '\n'; ++c) { |
740 | 14.0M | if (*c == '\0') |
741 | 173 | throw StaticError(filename, begin, "unexpected EOF"); |
742 | 14.0M | block << *c; |
743 | 14.0M | } |
744 | | // Add the \n |
745 | 22.2k | block << '\n'; |
746 | 22.2k | ++c; |
747 | 22.2k | line_number++; |
748 | 22.2k | line_start = c; |
749 | | // Skip any blank lines |
750 | 25.0k | while (*c == '\n') { |
751 | 2.74k | line_number++; |
752 | 2.74k | ++c; |
753 | 2.74k | block << '\n'; |
754 | 2.74k | } |
755 | | // Examine next line |
756 | 22.2k | ws_chars = whitespace_check(first_line, c); |
757 | 22.2k | if (ws_chars == 0) { |
758 | | // End of text block |
759 | | // Skip over any whitespace |
760 | 75.5k | while (*c == ' ' || *c == '\t') { |
761 | 58.2k | string_block_term_indent += *c; |
762 | 58.2k | ++c; |
763 | 58.2k | } |
764 | | // Expect ||| |
765 | 17.2k | if (!(*c == '|' && *(c + 1) == '|' && *(c + 2) == '|')) { |
766 | 240 | auto msg = "text block not terminated with |||"; |
767 | 240 | throw StaticError(filename, begin, msg); |
768 | 240 | } |
769 | 17.0k | c += 3; // Leave after the last | |
770 | 17.0k | data = block.str(); |
771 | 17.0k | kind = Token::STRING_BLOCK; |
772 | 17.0k | if (chomp_trailing_nl) { |
773 | 1.02k | assert(data.back() == '\n'); |
774 | 1.02k | data.pop_back(); |
775 | 1.02k | } |
776 | 17.0k | break; // Out of the while loop. |
777 | 17.0k | } |
778 | 22.2k | } |
779 | | |
780 | 17.0k | break; // Out of the switch. |
781 | 17.4k | } |
782 | | |
783 | 36.8M | const char *operator_begin = c; |
784 | 120M | for (; is_symbol(*c); ++c) { |
785 | | // Not allowed // in operators |
786 | 83.3M | if (*c == '/' && *(c + 1) == '/') |
787 | 1.71k | break; |
788 | | // Not allowed /* in operators |
789 | 83.3M | if (*c == '/' && *(c + 1) == '*') |
790 | 1.61k | break; |
791 | | // Not allowed ||| in operators |
792 | 83.3M | if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|') |
793 | 2.06k | break; |
794 | 83.3M | } |
795 | | // Not allowed to end with a + - ~ ! unless a single char. |
796 | | // So, wind it back if we need to (but not too far). |
797 | 47.5M | while (c > operator_begin + 1 && !allowed_at_end_of_operator(*(c - 1))) { |
798 | 10.6M | c--; |
799 | 10.6M | } |
800 | 36.8M | data += std::string(operator_begin, c); |
801 | 36.8M | if (data == "$") { |
802 | 144k | kind = Token::DOLLAR; |
803 | 144k | data = ""; |
804 | 36.7M | } else { |
805 | 36.7M | kind = Token::OPERATOR; |
806 | 36.7M | } |
807 | 36.8M | } else { |
808 | 410 | std::stringstream ss; |
809 | 410 | ss << "Could not lex the character "; |
810 | 410 | auto uc = (unsigned char)(*c); |
811 | 410 | if (*c < 32) |
812 | 374 | ss << "code " << unsigned(uc); |
813 | 36 | else |
814 | 36 | ss << "'" << *c << "'"; |
815 | 410 | throw StaticError(filename, begin, ss.str()); |
816 | 410 | } |
817 | 350M | } |
818 | | |
819 | | // Ensure that a bug in the above code does not cause an infinite memory consuming loop due |
820 | | // to pushing empty tokens. |
821 | 337M | if (c == original_c) { |
822 | 0 | throw StaticError(filename, begin, "internal lexing error: pointer did not advance"); |
823 | 0 | } |
824 | | |
825 | 337M | Location end(line_number, (c + 1) - line_start); |
826 | 337M | r.emplace_back(kind, |
827 | 337M | fodder, |
828 | 337M | data, |
829 | 337M | string_block_indent, |
830 | 337M | string_block_term_indent, |
831 | 337M | LocationRange(filename, begin, end)); |
832 | 337M | fodder.clear(); |
833 | 337M | fresh_line = false; |
834 | 337M | } |
835 | | |
836 | 46.2k | Location begin(line_number, c - line_start + 1); |
837 | 46.2k | Location end(line_number, (c + 1) - line_start + 1); |
838 | 46.2k | r.emplace_back(Token::END_OF_FILE, fodder, "", "", "", LocationRange(filename, begin, end)); |
839 | 46.2k | return r; |
840 | 47.8k | } |
841 | | |
842 | | std::string jsonnet_unlex(const Tokens &tokens) |
843 | 0 | { |
844 | 0 | std::stringstream ss; |
845 | 0 | for (const auto &t : tokens) { |
846 | 0 | for (const auto &f : t.fodder) { |
847 | 0 | switch (f.kind) { |
848 | 0 | case FodderElement::LINE_END: { |
849 | 0 | if (f.comment.size() > 0) { |
850 | 0 | ss << "LineEnd(" << f.blanks << ", " << f.indent << ", " << f.comment[0] |
851 | 0 | << ")\n"; |
852 | 0 | } else { |
853 | 0 | ss << "LineEnd(" << f.blanks << ", " << f.indent << ")\n"; |
854 | 0 | } |
855 | 0 | } break; |
856 | | |
857 | 0 | case FodderElement::INTERSTITIAL: { |
858 | 0 | ss << "Interstitial(" << f.comment[0] << ")\n"; |
859 | 0 | } break; |
860 | | |
861 | 0 | case FodderElement::PARAGRAPH: { |
862 | 0 | ss << "Paragraph(\n"; |
863 | 0 | for (const auto &line : f.comment) { |
864 | 0 | ss << " " << line << '\n'; |
865 | 0 | } |
866 | 0 | ss << ")" << f.blanks << "\n"; |
867 | 0 | } break; |
868 | 0 | } |
869 | 0 | } |
870 | 0 | if (t.kind == Token::END_OF_FILE) { |
871 | 0 | ss << "EOF\n"; |
872 | 0 | break; |
873 | 0 | } |
874 | 0 | if (t.kind == Token::STRING_DOUBLE) { |
875 | 0 | ss << "\"" << t.data << "\"\n"; |
876 | 0 | } else if (t.kind == Token::STRING_SINGLE) { |
877 | 0 | ss << "'" << t.data << "'\n"; |
878 | 0 | } else if (t.kind == Token::STRING_BLOCK) { |
879 | 0 | ss << "|||\n"; |
880 | 0 | ss << t.stringBlockIndent; |
881 | 0 | for (const char *cp = t.data.c_str(); *cp != '\0'; ++cp) { |
882 | 0 | ss << *cp; |
883 | 0 | if (*cp == '\n' && *(cp + 1) != '\n' && *(cp + 1) != '\0') { |
884 | 0 | ss << t.stringBlockIndent; |
885 | 0 | } |
886 | 0 | } |
887 | 0 | ss << t.stringBlockTermIndent << "|||\n"; |
888 | 0 | } else { |
889 | 0 | ss << t.data << "\n"; |
890 | 0 | } |
891 | 0 | } |
892 | 0 | return ss.str(); |
893 | 0 | } |
894 | | |
895 | | } // namespace jsonnet::internal |