/src/jsonnet/core/lexer.cpp
Line | Count | Source |
1 | | /* |
2 | | Copyright 2015 Google Inc. All rights reserved. |
3 | | |
4 | | Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | you may not use this file except in compliance with the License. |
6 | | You may obtain a copy of the License at |
7 | | |
8 | | http://www.apache.org/licenses/LICENSE-2.0 |
9 | | |
10 | | Unless required by applicable law or agreed to in writing, software |
11 | | distributed under the License is distributed on an "AS IS" BASIS, |
12 | | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | See the License for the specific language governing permissions and |
14 | | limitations under the License. |
15 | | */ |
16 | | |
17 | | #include <cassert> |
18 | | |
19 | | #include <map> |
20 | | #include <sstream> |
21 | | #include <string> |
22 | | |
23 | | #include "lexer.h" |
24 | | #include "static_error.h" |
25 | | #include "unicode.h" |
26 | | |
27 | | namespace jsonnet::internal { |
28 | | |
// Shared empty comment list, passed when constructing LINE_END fodder elements that carry no
// comment text.
static const std::vector<std::string> EMPTY;
30 | | |
/** Report whether the char is horizontal whitespace: space, tab or carriage return (never \n). */
static bool is_horz_ws(char c)
{
    switch (c) {
        case ' ':
        case '\t':
        case '\r': return true;
        default: return false;
    }
}
36 | | |
37 | | /** Is the char whitespace. */ |
38 | | static bool is_ws(char c) |
39 | 798M | { |
40 | 798M | return c == '\n' || is_horz_ws(c); |
41 | 798M | } |
42 | | |
43 | | /** Strip whitespace from both ends of a string, but only up to margin on the left hand side. */ |
44 | | static std::string strip_ws(const std::string &s, unsigned margin) |
45 | 19.6M | { |
46 | 19.6M | if (s.size() == 0) |
47 | 15.0M | return s; // Avoid underflow below. |
48 | 4.68M | size_t i = 0; |
49 | 10.3M | while (i < s.length() && is_horz_ws(s[i]) && i < margin) |
50 | 5.61M | i++; |
51 | 4.68M | size_t j = s.size(); |
52 | 8.02M | while (j > i && is_horz_ws(s[j - 1])) { |
53 | 3.34M | j--; |
54 | 3.34M | } |
55 | 4.68M | return std::string(&s[i], &s[j]); |
56 | 19.6M | } |
57 | | |
58 | | /** Split a string by \n and also strip left (up to margin) & right whitespace from each line. */ |
59 | | static std::vector<std::string> line_split(const std::string &s, unsigned margin) |
60 | 524k | { |
61 | 524k | std::vector<std::string> ret; |
62 | 524k | std::stringstream ss; |
63 | 131M | for (size_t i = 0; i < s.length(); ++i) { |
64 | 130M | if (s[i] == '\n') { |
65 | 19.1M | ret.emplace_back(strip_ws(ss.str(), margin)); |
66 | 19.1M | ss.str(""); |
67 | 111M | } else { |
68 | 111M | ss << s[i]; |
69 | 111M | } |
70 | 130M | } |
71 | 524k | ret.emplace_back(strip_ws(ss.str(), margin)); |
72 | 524k | return ret; |
73 | 524k | } |
74 | | |
75 | | /** Consume whitespace. |
76 | | * |
77 | | * Return number of \n and number of spaces after last \n. Convert \t to spaces. |
78 | | */ |
79 | | static void lex_ws(const char *&c, unsigned &new_lines, unsigned &indent, const char *&line_start, |
80 | | unsigned long &line_number) |
81 | 351M | { |
82 | 351M | indent = 0; |
83 | 351M | new_lines = 0; |
84 | 798M | for (; *c != '\0' && is_ws(*c); c++) { |
85 | 446M | switch (*c) { |
86 | 1.01M | case '\r': |
87 | | // Ignore. |
88 | 1.01M | break; |
89 | | |
90 | 52.6M | case '\n': |
91 | 52.6M | indent = 0; |
92 | 52.6M | new_lines++; |
93 | 52.6M | line_number++; |
94 | 52.6M | line_start = c + 1; |
95 | 52.6M | break; |
96 | | |
97 | 392M | case ' ': indent += 1; break; |
98 | | |
99 | | // This only works for \t at the beginning of lines, but we strip it everywhere else |
100 | | // anyway. The only case where this will cause a problem is spaces followed by \t |
101 | | // at the beginning of a line. However that is rare, ill-advised, and if re-indentation |
102 | | // is enabled it will be fixed later. |
103 | 48.1k | case '\t': indent += 8; break; |
104 | 446M | } |
105 | 446M | } |
106 | 351M | } |
107 | | |
108 | | /** |
109 | | # Consume all text until the end of the line, return number of newlines after that and indent |
110 | | */ |
111 | | static void lex_until_newline(const char *&c, std::string &text, unsigned &blanks, unsigned &indent, |
112 | | const char *&line_start, unsigned long &line_number) |
113 | 10.7M | { |
114 | 10.7M | const char *original_c = c; |
115 | 10.7M | const char *last_non_space = c; |
116 | 134M | for (; *c != '\0' && *c != '\n'; c++) { |
117 | 123M | if (!is_horz_ws(*c)) |
118 | 106M | last_non_space = c; |
119 | 123M | } |
120 | 10.7M | text = std::string(original_c, last_non_space - original_c + 1); |
121 | | // Consume subsequent whitespace including the '\n'. |
122 | 10.7M | unsigned new_lines; |
123 | 10.7M | lex_ws(c, new_lines, indent, line_start, line_number); |
124 | 10.7M | blanks = new_lines == 0 ? 0 : new_lines - 1; |
125 | 10.7M | } |
126 | | |
/** Report whether the char lies in the range 'A'..'Z'. */
static bool is_upper(char c)
{
    return 'A' <= c && c <= 'Z';
}
131 | | |
/** Report whether the char lies in the range 'a'..'z'. */
static bool is_lower(char c)
{
    return 'a' <= c && c <= 'z';
}
136 | | |
/** Report whether the char is a decimal digit '0'..'9'. */
static bool is_number(char c)
{
    return '0' <= c && c <= '9';
}
141 | | |
142 | | static bool is_identifier_first(char c) |
143 | 947M | { |
144 | 947M | return is_upper(c) || is_lower(c) || c == '_'; |
145 | 947M | } |
146 | | |
147 | | static bool is_identifier(char c) |
148 | 760M | { |
149 | 760M | return is_identifier_first(c) || is_number(c); |
150 | 760M | } |
151 | | |
/** Report whether the char is one of the punctuation chars that operators are built from. */
static bool is_symbol(char c)
{
    switch (c) {
        // Unary-capable and miscellaneous symbols.
        case '!': case '$': case ':': case '~':
        // Arithmetic.
        case '+': case '-': case '*': case '/': case '%':
        // Bitwise.
        case '&': case '|': case '^':
        // Comparison and assignment.
        case '=': case '<': case '>':
            return true;

        default:
            return false;
    }
}
173 | | |
/** Report whether a multi-char operator may end with this char.
 *
 * The chars + - ~ ! $ are rejected; every other char is accepted.
 */
bool allowed_at_end_of_operator(char c)
{
    return c != '+' && c != '-' && c != '~' && c != '!' && c != '$';
}
184 | | |
// Reserved words of the language, each mapped to its dedicated token kind. Consulted by
// lex_get_keyword_kind; any identifier not listed here is an ordinary Token::IDENTIFIER.
static const std::map<std::string, Token::Kind> keywords = {
    {"assert", Token::ASSERT},
    {"else", Token::ELSE},
    {"error", Token::ERROR},
    {"false", Token::FALSE},
    {"for", Token::FOR},
    {"function", Token::FUNCTION},
    {"if", Token::IF},
    {"import", Token::IMPORT},
    {"importstr", Token::IMPORTSTR},
    {"importbin", Token::IMPORTBIN},
    {"in", Token::IN},
    {"local", Token::LOCAL},
    {"null", Token::NULL_LIT},
    {"self", Token::SELF},
    {"super", Token::SUPER},
    {"tailstrict", Token::TAILSTRICT},
    {"then", Token::THEN},
    {"true", Token::TRUE},
};
205 | | |
206 | | Token::Kind lex_get_keyword_kind(const std::string &identifier) |
207 | 138M | { |
208 | 138M | auto it = keywords.find(identifier); |
209 | 138M | if (it == keywords.end()) |
210 | 101M | return Token::IDENTIFIER; |
211 | 36.9M | return it->second; |
212 | 138M | } |
213 | | |
214 | | std::string lex_number(const char *&c, const std::string &filename, const Location &begin) |
215 | 19.7M | { |
216 | | // This function should be understood with reference to the linked image: |
217 | | // https://www.json.org/img/number.png |
218 | | |
219 | | // Note, we deviate from the json.org documentation as follows: |
220 | | // There is no reason to lex negative numbers as atomic tokens, it is better to parse them |
221 | | // as a unary operator combined with a numeric literal. This avoids x-1 being tokenized as |
222 | | // <identifier> <number> instead of the intended <identifier> <binop> <number>. |
223 | | |
224 | 19.7M | enum State { |
225 | 19.7M | BEGIN, |
226 | 19.7M | AFTER_ZERO, |
227 | 19.7M | AFTER_ONE_TO_NINE, |
228 | 19.7M | AFTER_DOT, |
229 | 19.7M | AFTER_DIGIT, |
230 | 19.7M | AFTER_E, |
231 | 19.7M | AFTER_EXP_SIGN, |
232 | 19.7M | AFTER_EXP_DIGIT |
233 | 19.7M | } state; |
234 | | |
235 | 19.7M | std::string r; |
236 | | |
237 | 19.7M | state = BEGIN; |
238 | 45.4M | while (true) { |
239 | 45.4M | switch (state) { |
240 | 19.7M | case BEGIN: |
241 | 19.7M | switch (*c) { |
242 | 8.02M | case '0': state = AFTER_ZERO; break; |
243 | | |
244 | 4.34M | case '1': |
245 | 5.20M | case '2': |
246 | 6.00M | case '3': |
247 | 7.27M | case '4': |
248 | 7.50M | case '5': |
249 | 8.34M | case '6': |
250 | 8.68M | case '7': |
251 | 10.4M | case '8': |
252 | 11.6M | case '9': state = AFTER_ONE_TO_NINE; break; |
253 | | |
254 | 0 | default: throw StaticError(filename, begin, "couldn't lex number"); |
255 | 19.7M | } |
256 | 19.7M | break; |
257 | | |
258 | 19.7M | case AFTER_ZERO: |
259 | 8.02M | switch (*c) { |
260 | 47.9k | case '.': state = AFTER_DOT; break; |
261 | | |
262 | 638 | case 'e': |
263 | 3.74k | case 'E': state = AFTER_E; break; |
264 | | |
265 | 7.97M | default: goto end; |
266 | 8.02M | } |
267 | 51.6k | break; |
268 | | |
269 | 14.6M | case AFTER_ONE_TO_NINE: |
270 | 14.6M | switch (*c) { |
271 | 30.6k | case '.': state = AFTER_DOT; break; |
272 | | |
273 | 2.46k | case 'e': |
274 | 442k | case 'E': state = AFTER_E; break; |
275 | | |
276 | 1.05M | case '0': |
277 | 1.10M | case '1': |
278 | 1.37M | case '2': |
279 | 1.46M | case '3': |
280 | 1.58M | case '4': |
281 | 1.81M | case '5': |
282 | 1.96M | case '6': |
283 | 2.04M | case '7': |
284 | 2.12M | case '8': |
285 | 2.96M | case '9': state = AFTER_ONE_TO_NINE; break; |
286 | | |
287 | 11.2M | default: goto end; |
288 | 14.6M | } |
289 | 3.43M | break; |
290 | | |
291 | 3.43M | case AFTER_DOT: |
292 | 78.5k | switch (*c) { |
293 | 3.35k | case '0': |
294 | 28.1k | case '1': |
295 | 28.5k | case '2': |
296 | 29.4k | case '3': |
297 | 30.2k | case '4': |
298 | 76.6k | case '5': |
299 | 77.4k | case '6': |
300 | 77.6k | case '7': |
301 | 78.1k | case '8': |
302 | 78.5k | case '9': state = AFTER_DIGIT; break; |
303 | | |
304 | 27 | default: { |
305 | 27 | std::stringstream ss; |
306 | 27 | ss << "couldn't lex number, junk after decimal point: " << *c; |
307 | 27 | throw StaticError(filename, begin, ss.str()); |
308 | 78.1k | } |
309 | 78.5k | } |
310 | 78.5k | break; |
311 | | |
312 | 1.63M | case AFTER_DIGIT: |
313 | 1.63M | switch (*c) { |
314 | 2.37k | case 'e': |
315 | 3.32k | case 'E': state = AFTER_E; break; |
316 | | |
317 | 1.10M | case '0': |
318 | 1.18M | case '1': |
319 | 1.20M | case '2': |
320 | 1.25M | case '3': |
321 | 1.28M | case '4': |
322 | 1.35M | case '5': |
323 | 1.42M | case '6': |
324 | 1.44M | case '7': |
325 | 1.47M | case '8': |
326 | 1.55M | case '9': state = AFTER_DIGIT; break; |
327 | | |
328 | 75.2k | default: goto end; |
329 | 1.63M | } |
330 | 1.55M | break; |
331 | | |
332 | 1.55M | case AFTER_E: |
333 | 450k | switch (*c) { |
334 | 1.87k | case '+': |
335 | 4.75k | case '-': state = AFTER_EXP_SIGN; break; |
336 | | |
337 | 2.37k | case '0': |
338 | 4.56k | case '1': |
339 | 440k | case '2': |
340 | 442k | case '3': |
341 | 443k | case '4': |
342 | 444k | case '5': |
343 | 444k | case '6': |
344 | 444k | case '7': |
345 | 444k | case '8': |
346 | 445k | case '9': state = AFTER_EXP_DIGIT; break; |
347 | | |
348 | 82 | default: { |
349 | 82 | std::stringstream ss; |
350 | 82 | ss << "couldn't lex number, junk after 'E': " << *c; |
351 | 82 | throw StaticError(filename, begin, ss.str()); |
352 | 444k | } |
353 | 450k | } |
354 | 449k | break; |
355 | | |
356 | 449k | case AFTER_EXP_SIGN: |
357 | 4.75k | switch (*c) { |
358 | 550 | case '0': |
359 | 892 | case '1': |
360 | 1.05k | case '2': |
361 | 4.18k | case '3': |
362 | 4.64k | case '4': |
363 | 4.64k | case '5': |
364 | 4.65k | case '6': |
365 | 4.65k | case '7': |
366 | 4.66k | case '8': |
367 | 4.74k | case '9': state = AFTER_EXP_DIGIT; break; |
368 | | |
369 | 16 | default: { |
370 | 16 | std::stringstream ss; |
371 | 16 | ss << "couldn't lex number, junk after exponent sign: " << *c; |
372 | 16 | throw StaticError(filename, begin, ss.str()); |
373 | 4.66k | } |
374 | 4.75k | } |
375 | 4.74k | break; |
376 | | |
377 | 905k | case AFTER_EXP_DIGIT: |
378 | 905k | switch (*c) { |
379 | 4.65k | case '0': |
380 | 8.06k | case '1': |
381 | 10.7k | case '2': |
382 | 443k | case '3': |
383 | 446k | case '4': |
384 | 448k | case '5': |
385 | 449k | case '6': |
386 | 451k | case '7': |
387 | 453k | case '8': |
388 | 455k | case '9': state = AFTER_EXP_DIGIT; break; |
389 | | |
390 | 449k | default: goto end; |
391 | 905k | } |
392 | 455k | break; |
393 | 45.4M | } |
394 | 25.7M | r += *c; |
395 | 25.7M | c++; |
396 | 25.7M | } |
397 | 19.7M | end: |
398 | 19.7M | return r; |
399 | 19.7M | } |
400 | | |
// Measure the run of spaces/tabs at the start of a, provided b starts with exactly the same
// run. Returns the length of that run, or 0 if b diverges from it. If a has no whitespace
// prefix then the result is also 0.
static int whitespace_check(const char *a, const char *b)
{
    int n = 0;
    for (; a[n] == ' ' || a[n] == '\t'; ++n) {
        if (a[n] != b[n])
            return 0;
    }
    return n;
}
413 | | |
414 | | /* |
415 | | static void add_whitespace(Fodder &fodder, const char *s, size_t n) |
416 | | { |
417 | | std::string ws(s, n); |
418 | | if (fodder.size() == 0 || fodder.back().kind != FodderElement::WHITESPACE) { |
419 | | fodder.emplace_back(FodderElement::WHITESPACE, ws); |
420 | | } else { |
421 | | fodder.back().data += ws; |
422 | | } |
423 | | } |
424 | | */ |
425 | | |
426 | | Tokens jsonnet_lex(const std::string &filename, const char *input) |
427 | 46.3k | { |
428 | 46.3k | unsigned long line_number = 1; |
429 | 46.3k | const char *line_start = input; |
430 | | |
431 | 46.3k | Tokens r; |
432 | | |
433 | 46.3k | const char *c = input; |
434 | | |
435 | 46.3k | Fodder fodder; |
436 | 46.3k | bool fresh_line = true; // Are we tokenizing from the beginning of a new line? |
437 | | |
438 | 339M | while (*c != '\0') { |
439 | | // Used to ensure we have actually advanced the pointer by the end of the iteration. |
440 | 339M | const char *original_c = c; |
441 | | |
442 | 339M | Token::Kind kind; |
443 | 339M | std::string data; |
444 | 339M | std::string string_block_indent; |
445 | 339M | std::string string_block_term_indent; |
446 | | |
447 | 339M | unsigned new_lines, indent; |
448 | 339M | lex_ws(c, new_lines, indent, line_start, line_number); |
449 | | |
450 | | // If it's the end of the file, discard final whitespace. |
451 | 339M | if (*c == '\0') |
452 | 23.7k | break; |
453 | | |
454 | 339M | if (new_lines > 0) { |
455 | | // Otherwise store whitespace in fodder. |
456 | 34.6M | unsigned blanks = new_lines - 1; |
457 | 34.6M | fodder.emplace_back(FodderElement::LINE_END, blanks, indent, EMPTY); |
458 | 34.6M | fresh_line = true; |
459 | 34.6M | } |
460 | | |
461 | 339M | Location begin(line_number, c - line_start + 1); |
462 | | |
463 | 339M | switch (*c) { |
464 | | // The following operators should never be combined with subsequent symbols. |
465 | 1.43M | case '{': |
466 | 1.43M | kind = Token::BRACE_L; |
467 | 1.43M | c++; |
468 | 1.43M | break; |
469 | | |
470 | 1.41M | case '}': |
471 | 1.41M | kind = Token::BRACE_R; |
472 | 1.41M | c++; |
473 | 1.41M | break; |
474 | | |
475 | 7.34M | case '[': |
476 | 7.34M | kind = Token::BRACKET_L; |
477 | 7.34M | c++; |
478 | 7.34M | break; |
479 | | |
480 | 7.31M | case ']': |
481 | 7.31M | kind = Token::BRACKET_R; |
482 | 7.31M | c++; |
483 | 7.31M | break; |
484 | | |
485 | 30.8M | case ',': |
486 | 30.8M | kind = Token::COMMA; |
487 | 30.8M | c++; |
488 | 30.8M | break; |
489 | | |
490 | 16.5M | case '.': |
491 | 16.5M | kind = Token::DOT; |
492 | 16.5M | c++; |
493 | 16.5M | break; |
494 | | |
495 | 25.0M | case '(': |
496 | 25.0M | kind = Token::PAREN_L; |
497 | 25.0M | c++; |
498 | 25.0M | break; |
499 | | |
500 | 25.0M | case ')': |
501 | 25.0M | kind = Token::PAREN_R; |
502 | 25.0M | c++; |
503 | 25.0M | break; |
504 | | |
505 | 6.62M | case ';': |
506 | 6.62M | kind = Token::SEMICOLON; |
507 | 6.62M | c++; |
508 | 6.62M | break; |
509 | | |
510 | | // Numeric literals. |
511 | 8.02M | case '0': |
512 | 12.3M | case '1': |
513 | 13.2M | case '2': |
514 | 14.0M | case '3': |
515 | 15.3M | case '4': |
516 | 15.5M | case '5': |
517 | 16.3M | case '6': |
518 | 16.7M | case '7': |
519 | 18.5M | case '8': |
520 | 19.7M | case '9': |
521 | 19.7M | kind = Token::NUMBER; |
522 | 19.7M | data = lex_number(c, filename, begin); |
523 | 19.7M | break; |
524 | | |
525 | | // UString literals. |
526 | 320k | case '"': { |
527 | 320k | c++; |
528 | 82.2M | for (;; ++c) { |
529 | 82.2M | if (*c == '\0') { |
530 | 88 | throw StaticError(filename, begin, "unterminated string"); |
531 | 88 | } |
532 | 82.2M | if (*c == '"') { |
533 | 320k | break; |
534 | 320k | } |
535 | 81.8M | if (*c == '\\' && *(c + 1) != '\0') { |
536 | 199k | data += *c; |
537 | 199k | ++c; |
538 | 199k | } |
539 | 81.8M | if (*c == '\n') { |
540 | | // Maintain line/column counters. |
541 | 7.53M | line_number++; |
542 | 7.53M | line_start = c + 1; |
543 | 7.53M | } |
544 | 81.8M | data += *c; |
545 | 81.8M | } |
546 | 320k | c++; // Advance beyond the ". |
547 | 320k | kind = Token::STRING_DOUBLE; |
548 | 320k | } break; |
549 | | |
550 | | // UString literals. |
551 | 10.8M | case '\'': { |
552 | 10.8M | c++; |
553 | 142M | for (;; ++c) { |
554 | 142M | if (*c == '\0') { |
555 | 82 | throw StaticError(filename, begin, "unterminated string"); |
556 | 82 | } |
557 | 142M | if (*c == '\'') { |
558 | 10.8M | break; |
559 | 10.8M | } |
560 | 131M | if (*c == '\\' && *(c + 1) != '\0') { |
561 | 1.18M | data += *c; |
562 | 1.18M | ++c; |
563 | 1.18M | } |
564 | 131M | if (*c == '\n') { |
565 | | // Maintain line/column counters. |
566 | 3.81M | line_number++; |
567 | 3.81M | line_start = c + 1; |
568 | 3.81M | } |
569 | 131M | data += *c; |
570 | 131M | } |
571 | 10.8M | c++; // Advance beyond the '. |
572 | 10.8M | kind = Token::STRING_SINGLE; |
573 | 10.8M | } break; |
574 | | |
575 | | // Verbatim string literals. |
576 | | // ' and " quoting is interpreted here, unlike non-verbatim strings |
577 | | // where it is done later by jsonnet_string_unescape. This is OK |
578 | | // in this case because no information is lost by resoving the |
579 | | // repeated quote into a single quote, so we can go back to the |
580 | | // original form in the formatter. |
581 | 10.6k | case '@': { |
582 | 10.6k | c++; |
583 | 10.6k | if (*c != '"' && *c != '\'') { |
584 | 37 | std::stringstream ss; |
585 | 37 | ss << "couldn't lex verbatim string, junk after '@': " << *c; |
586 | 37 | throw StaticError(filename, begin, ss.str()); |
587 | 37 | } |
588 | 10.6k | const char quot = *c; |
589 | 10.6k | c++; // Advance beyond the opening quote. |
590 | 454k | for (;; ++c) { |
591 | 454k | if (*c == '\0') { |
592 | 75 | throw StaticError(filename, begin, "unterminated verbatim string"); |
593 | 75 | } |
594 | 454k | if (*c == quot) { |
595 | 13.4k | if (*(c + 1) == quot) { |
596 | 2.95k | c++; |
597 | 10.5k | } else { |
598 | 10.5k | break; |
599 | 10.5k | } |
600 | 13.4k | } |
601 | 444k | data += *c; |
602 | 444k | } |
603 | 10.5k | c++; // Advance beyond the closing quote. |
604 | 10.5k | if (quot == '"') { |
605 | 7.62k | kind = Token::VERBATIM_STRING_DOUBLE; |
606 | 7.62k | } else { |
607 | 2.92k | kind = Token::VERBATIM_STRING_SINGLE; |
608 | 2.92k | } |
609 | 10.5k | } break; |
610 | | |
611 | | // Keywords |
612 | 186M | default: |
613 | 186M | if (is_identifier_first(*c)) { |
614 | 138M | std::string id; |
615 | 760M | for (; is_identifier(*c); ++c) |
616 | 622M | id += *c; |
617 | 138M | kind = lex_get_keyword_kind(id); |
618 | 138M | data = id; |
619 | | |
620 | 138M | } else if (is_symbol(*c) || *c == '#') { |
621 | | // Single line C++ and Python style comments. |
622 | 48.2M | if (*c == '#' || (*c == '/' && *(c + 1) == '/')) { |
623 | 10.7M | std::vector<std::string> comment(1); |
624 | 10.7M | unsigned blanks; |
625 | 10.7M | unsigned indent; |
626 | 10.7M | lex_until_newline(c, comment[0], blanks, indent, line_start, line_number); |
627 | 10.7M | auto kind = fresh_line ? FodderElement::PARAGRAPH : FodderElement::LINE_END; |
628 | 10.7M | fodder.emplace_back(kind, blanks, indent, comment); |
629 | 10.7M | fresh_line = true; |
630 | 10.7M | continue; // We've not got a token, just fodder, so keep scanning. |
631 | 10.7M | } |
632 | | |
633 | | // Multi-line C style comment. |
634 | 37.4M | if (*c == '/' && *(c + 1) == '*') { |
635 | 1.79M | unsigned margin = c - line_start; |
636 | | |
637 | 1.79M | const char *initial_c = c; |
638 | 1.79M | c += 2; // Avoid matching /*/: skip the /* before starting the search for |
639 | | // */. |
640 | | |
641 | 135M | while (!(*c == '*' && *(c + 1) == '/')) { |
642 | 133M | if (*c == '\0') { |
643 | 222 | auto msg = "multi-line comment has no terminating */."; |
644 | 222 | throw StaticError(filename, begin, msg); |
645 | 222 | } |
646 | 133M | if (*c == '\n') { |
647 | | // Just keep track of the line / column counters. |
648 | 19.1M | line_number++; |
649 | 19.1M | line_start = c + 1; |
650 | 19.1M | } |
651 | 133M | ++c; |
652 | 133M | } |
653 | 1.79M | c += 2; // Move the pointer to the char after the closing '/'. |
654 | | |
655 | 1.79M | std::string comment(initial_c, |
656 | 1.79M | c - initial_c); // Includes the "/*" and "*/". |
657 | | |
658 | | // Lex whitespace after comment |
659 | 1.79M | unsigned new_lines_after, indent_after; |
660 | 1.79M | lex_ws(c, new_lines_after, indent_after, line_start, line_number); |
661 | 1.79M | std::vector<std::string> lines; |
662 | 1.79M | if (comment.find('\n') >= comment.length()) { |
663 | | // Comment looks like /* foo */ |
664 | 1.26M | lines.push_back(comment); |
665 | 1.26M | fodder.emplace_back(FodderElement::INTERSTITIAL, 0, 0, lines); |
666 | 1.26M | if (new_lines_after > 0) { |
667 | 1.20M | fodder.emplace_back(FodderElement::LINE_END, |
668 | 1.20M | new_lines_after - 1, |
669 | 1.20M | indent_after, |
670 | 1.20M | EMPTY); |
671 | 1.20M | fresh_line = true; |
672 | 1.20M | } |
673 | 1.26M | } else { |
674 | 524k | lines = line_split(comment, margin); |
675 | 524k | assert(lines[0][0] == '/'); |
676 | | // Little hack to support PARAGRAPHs with * down the LHS: |
677 | | // Add a space to lines that start with a '*' |
678 | 524k | bool all_star = true; |
679 | 19.6M | for (auto &l : lines) { |
680 | 19.6M | if (l[0] != '*') |
681 | 19.4M | all_star = false; |
682 | 19.6M | } |
683 | 524k | if (all_star) { |
684 | 0 | for (auto &l : lines) { |
685 | 0 | if (l[0] == '*') |
686 | 0 | l = " " + l; |
687 | 0 | } |
688 | 0 | } |
689 | 524k | if (new_lines_after == 0) { |
690 | | // Ensure a line end after the paragraph. |
691 | 26.9k | new_lines_after = 1; |
692 | 26.9k | indent_after = 0; |
693 | 26.9k | } |
694 | 524k | fodder_push_back(fodder, |
695 | 524k | FodderElement(FodderElement::PARAGRAPH, |
696 | 524k | new_lines_after - 1, |
697 | 524k | indent_after, |
698 | 524k | lines)); |
699 | 524k | fresh_line = true; |
700 | 524k | } |
701 | 1.79M | continue; // We've not got a token, just fodder, so keep scanning. |
702 | 1.79M | } |
703 | | |
704 | | // Text block |
705 | 35.6M | if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|') { |
706 | 18.1k | c += 3; // Skip the "|||". |
707 | | |
708 | 18.1k | bool chomp_trailing_nl = false; |
709 | 18.1k | if (*c == '-') { |
710 | 1.07k | chomp_trailing_nl = true; |
711 | 1.07k | c++; |
712 | 1.07k | } |
713 | | |
714 | 21.6k | while (is_horz_ws(*c)) ++c; // Chomp whitespace at end of line. |
715 | 18.1k | if (*c != '\n') { |
716 | 112 | auto msg = "text block syntax requires new line after |||."; |
717 | 112 | throw StaticError(filename, begin, msg); |
718 | 112 | } |
719 | 18.0k | std::stringstream block; |
720 | 18.0k | c++; // Skip the "\n" |
721 | 18.0k | line_number++; |
722 | | // Skip any blank lines at the beginning of the block. |
723 | 22.4k | while (*c == '\n') { |
724 | 4.48k | line_number++; |
725 | 4.48k | ++c; |
726 | 4.48k | block << '\n'; |
727 | 4.48k | } |
728 | 18.0k | line_start = c; |
729 | 18.0k | const char *first_line = c; |
730 | 18.0k | int ws_chars = whitespace_check(first_line, c); |
731 | 18.0k | string_block_indent = std::string(first_line, ws_chars); |
732 | 18.0k | if (ws_chars == 0) { |
733 | 76 | auto msg = "text block's first line must start with whitespace."; |
734 | 76 | throw StaticError(filename, begin, msg); |
735 | 76 | } |
736 | 22.5k | while (true) { |
737 | 22.5k | assert(ws_chars > 0); |
738 | | // Read up to the \n |
739 | 12.9M | for (c = &c[ws_chars]; *c != '\n'; ++c) { |
740 | 12.9M | if (*c == '\0') |
741 | 173 | throw StaticError(filename, begin, "unexpected EOF"); |
742 | 12.9M | block << *c; |
743 | 12.9M | } |
744 | | // Add the \n |
745 | 22.3k | block << '\n'; |
746 | 22.3k | ++c; |
747 | 22.3k | line_number++; |
748 | 22.3k | line_start = c; |
749 | | // Skip any blank lines |
750 | 24.9k | while (*c == '\n') { |
751 | 2.56k | line_number++; |
752 | 2.56k | ++c; |
753 | 2.56k | block << '\n'; |
754 | 2.56k | } |
755 | | // Examine next line |
756 | 22.3k | ws_chars = whitespace_check(first_line, c); |
757 | 22.3k | if (ws_chars == 0) { |
758 | | // End of text block |
759 | | // Skip over any whitespace |
760 | 73.2k | while (*c == ' ' || *c == '\t') { |
761 | 55.4k | string_block_term_indent += *c; |
762 | 55.4k | ++c; |
763 | 55.4k | } |
764 | | // Expect ||| |
765 | 17.7k | if (!(*c == '|' && *(c + 1) == '|' && *(c + 2) == '|')) { |
766 | 229 | auto msg = "text block not terminated with |||"; |
767 | 229 | throw StaticError(filename, begin, msg); |
768 | 229 | } |
769 | 17.5k | c += 3; // Leave after the last | |
770 | 17.5k | data = block.str(); |
771 | 17.5k | kind = Token::STRING_BLOCK; |
772 | 17.5k | if (chomp_trailing_nl) { |
773 | 1.05k | assert(data.back() == '\n'); |
774 | 1.05k | data.pop_back(); |
775 | 1.05k | } |
776 | 17.5k | break; // Out of the while loop. |
777 | 17.5k | } |
778 | 22.3k | } |
779 | | |
780 | 17.5k | break; // Out of the switch. |
781 | 17.9k | } |
782 | | |
783 | 35.6M | const char *operator_begin = c; |
784 | 117M | for (; is_symbol(*c); ++c) { |
785 | | // Not allowed // in operators |
786 | 81.5M | if (*c == '/' && *(c + 1) == '/') |
787 | 1.29k | break; |
788 | | // Not allowed /* in operators |
789 | 81.5M | if (*c == '/' && *(c + 1) == '*') |
790 | 1.59k | break; |
791 | | // Not allowed ||| in operators |
792 | 81.5M | if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|') |
793 | 2.03k | break; |
794 | 81.5M | } |
795 | | // Not allowed to end with a + - ~ ! unless a single char. |
796 | | // So, wind it back if we need to (but not too far). |
797 | 46.3M | while (c > operator_begin + 1 && !allowed_at_end_of_operator(*(c - 1))) { |
798 | 10.6M | c--; |
799 | 10.6M | } |
800 | 35.6M | data += std::string(operator_begin, c); |
801 | 35.6M | if (data == "$") { |
802 | 107k | kind = Token::DOLLAR; |
803 | 107k | data = ""; |
804 | 35.5M | } else { |
805 | 35.5M | kind = Token::OPERATOR; |
806 | 35.5M | } |
807 | 35.6M | } else { |
808 | 411 | std::stringstream ss; |
809 | 411 | ss << "Could not lex the character "; |
810 | 411 | auto uc = (unsigned char)(*c); |
811 | 411 | if (*c < 32) |
812 | 374 | ss << "code " << unsigned(uc); |
813 | 37 | else |
814 | 37 | ss << "'" << *c << "'"; |
815 | 411 | throw StaticError(filename, begin, ss.str()); |
816 | 411 | } |
817 | 339M | } |
818 | | |
819 | | // Ensure that a bug in the above code does not cause an infinite memory consuming loop due |
820 | | // to pushing empty tokens. |
821 | 326M | if (c == original_c) { |
822 | 0 | throw StaticError(filename, begin, "internal lexing error: pointer did not advance"); |
823 | 0 | } |
824 | | |
825 | 326M | Location end(line_number, (c + 1) - line_start); |
826 | 326M | r.emplace_back(kind, |
827 | 326M | fodder, |
828 | 326M | data, |
829 | 326M | string_block_indent, |
830 | 326M | string_block_term_indent, |
831 | 326M | LocationRange(filename, begin, end)); |
832 | 326M | fodder.clear(); |
833 | 326M | fresh_line = false; |
834 | 326M | } |
835 | | |
836 | 44.7k | Location begin(line_number, c - line_start + 1); |
837 | 44.7k | Location end(line_number, (c + 1) - line_start + 1); |
838 | 44.7k | r.emplace_back(Token::END_OF_FILE, fodder, "", "", "", LocationRange(filename, begin, end)); |
839 | 44.7k | return r; |
840 | 46.3k | } |
841 | | |
842 | | std::string jsonnet_unlex(const Tokens &tokens) |
843 | 0 | { |
844 | 0 | std::stringstream ss; |
845 | 0 | for (const auto &t : tokens) { |
846 | 0 | for (const auto &f : t.fodder) { |
847 | 0 | switch (f.kind) { |
848 | 0 | case FodderElement::LINE_END: { |
849 | 0 | if (f.comment.size() > 0) { |
850 | 0 | ss << "LineEnd(" << f.blanks << ", " << f.indent << ", " << f.comment[0] |
851 | 0 | << ")\n"; |
852 | 0 | } else { |
853 | 0 | ss << "LineEnd(" << f.blanks << ", " << f.indent << ")\n"; |
854 | 0 | } |
855 | 0 | } break; |
856 | | |
857 | 0 | case FodderElement::INTERSTITIAL: { |
858 | 0 | ss << "Interstitial(" << f.comment[0] << ")\n"; |
859 | 0 | } break; |
860 | | |
861 | 0 | case FodderElement::PARAGRAPH: { |
862 | 0 | ss << "Paragraph(\n"; |
863 | 0 | for (const auto &line : f.comment) { |
864 | 0 | ss << " " << line << '\n'; |
865 | 0 | } |
866 | 0 | ss << ")" << f.blanks << "\n"; |
867 | 0 | } break; |
868 | 0 | } |
869 | 0 | } |
870 | 0 | if (t.kind == Token::END_OF_FILE) { |
871 | 0 | ss << "EOF\n"; |
872 | 0 | break; |
873 | 0 | } |
874 | 0 | if (t.kind == Token::STRING_DOUBLE) { |
875 | 0 | ss << "\"" << t.data << "\"\n"; |
876 | 0 | } else if (t.kind == Token::STRING_SINGLE) { |
877 | 0 | ss << "'" << t.data << "'\n"; |
878 | 0 | } else if (t.kind == Token::STRING_BLOCK) { |
879 | 0 | ss << "|||\n"; |
880 | 0 | ss << t.stringBlockIndent; |
881 | 0 | for (const char *cp = t.data.c_str(); *cp != '\0'; ++cp) { |
882 | 0 | ss << *cp; |
883 | 0 | if (*cp == '\n' && *(cp + 1) != '\n' && *(cp + 1) != '\0') { |
884 | 0 | ss << t.stringBlockIndent; |
885 | 0 | } |
886 | 0 | } |
887 | 0 | ss << t.stringBlockTermIndent << "|||\n"; |
888 | 0 | } else { |
889 | 0 | ss << t.data << "\n"; |
890 | 0 | } |
891 | 0 | } |
892 | 0 | return ss.str(); |
893 | 0 | } |
894 | | |
895 | | } // namespace jsonnet::internal |