/src/jsonnet/core/lexer.cpp
Line | Count | Source |
1 | | /* |
2 | | Copyright 2015 Google Inc. All rights reserved. |
3 | | |
4 | | Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | you may not use this file except in compliance with the License. |
6 | | You may obtain a copy of the License at |
7 | | |
8 | | http://www.apache.org/licenses/LICENSE-2.0 |
9 | | |
10 | | Unless required by applicable law or agreed to in writing, software |
11 | | distributed under the License is distributed on an "AS IS" BASIS, |
12 | | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | See the License for the specific language governing permissions and |
14 | | limitations under the License. |
15 | | */ |
16 | | |
17 | | #include <cassert> |
18 | | |
19 | | #include <map> |
20 | | #include <sstream> |
21 | | #include <string> |
22 | | |
23 | | #include "lexer.h" |
24 | | #include "static_error.h" |
25 | | #include "unicode.h" |
26 | | |
27 | | namespace jsonnet::internal { |
28 | | |
// Shared empty comment list, passed to fodder elements that carry no comment text
// (see the LINE_END elements created in jsonnet_lex).
static const std::vector<std::string> EMPTY;
30 | | |
/** True for horizontal whitespace: space, tab or carriage return (never \n). */
static bool is_horz_ws(char c)
{
    switch (c) {
        case ' ':
        case '\t':
        case '\r': return true;

        default: return false;
    }
}
36 | | |
37 | | /** Is the char whitespace. */ |
38 | | static bool is_ws(char c) |
39 | 828M | { |
40 | 828M | return c == '\n' || is_horz_ws(c); |
41 | 828M | } |
42 | | |
43 | | /** Strip whitespace from both ends of a string, but only up to margin on the left hand side. */ |
44 | | static std::string strip_ws(const std::string &s, unsigned margin) |
45 | 19.7M | { |
46 | 19.7M | if (s.size() == 0) |
47 | 15.3M | return s; // Avoid underflow below. |
48 | 4.45M | size_t i = 0; |
49 | 10.2M | while (i < s.length() && is_horz_ws(s[i]) && i < margin) |
50 | 5.82M | i++; |
51 | 4.45M | size_t j = s.size(); |
52 | 8.15M | while (j > i && is_horz_ws(s[j - 1])) { |
53 | 3.69M | j--; |
54 | 3.69M | } |
55 | 4.45M | return std::string(&s[i], &s[j]); |
56 | 19.7M | } |
57 | | |
58 | | /** Split a string by \n and also strip left (up to margin) & right whitespace from each line. */ |
59 | | static std::vector<std::string> line_split(const std::string &s, unsigned margin) |
60 | 428k | { |
61 | 428k | std::vector<std::string> ret; |
62 | 428k | std::stringstream ss; |
63 | 132M | for (size_t i = 0; i < s.length(); ++i) { |
64 | 132M | if (s[i] == '\n') { |
65 | 19.3M | ret.emplace_back(strip_ws(ss.str(), margin)); |
66 | 19.3M | ss.str(""); |
67 | 113M | } else { |
68 | 113M | ss << s[i]; |
69 | 113M | } |
70 | 132M | } |
71 | 428k | ret.emplace_back(strip_ws(ss.str(), margin)); |
72 | 428k | return ret; |
73 | 428k | } |
74 | | |
/** Skip over a run of whitespace.
 *
 * On return c points at the first character that is not ' ', '\t', '\r' or '\n' (possibly the
 * terminating NUL). new_lines receives the number of \n consumed and indent the width of the
 * whitespace following the last \n, with each \t counted as 8 columns. line_start and line_number
 * are advanced for every \n consumed.
 */
static void lex_ws(const char *&c, unsigned &new_lines, unsigned &indent, const char *&line_start,
                   unsigned long &line_number)
{
    new_lines = 0;
    indent = 0;
    for (;; ++c) {
        switch (*c) {
            case '\r':
                // Carriage returns contribute nothing.
                break;

            case '\n':
                ++new_lines;
                ++line_number;
                line_start = c + 1;
                indent = 0;
                break;

            case ' ': indent += 1; break;

            // This only works for \t at the beginning of lines, but we strip it everywhere else
            // anyway.  The only case where this will cause a problem is spaces followed by \t
            // at the beginning of a line.  However that is rare, ill-advised, and if
            // re-indentation is enabled it will be fixed later.
            case '\t': indent += 8; break;

            // Anything else, including the terminating NUL, ends the whitespace run.
            default: return;
        }
    }
}
107 | | |
108 | | /** |
109 | | # Consume all text until the end of the line, return number of newlines after that and indent |
110 | | */ |
111 | | static void lex_until_newline(const char *&c, std::string &text, unsigned &blanks, unsigned &indent, |
112 | | const char *&line_start, unsigned long &line_number) |
113 | 11.4M | { |
114 | 11.4M | const char *original_c = c; |
115 | 11.4M | const char *last_non_space = c; |
116 | 139M | for (; *c != '\0' && *c != '\n'; c++) { |
117 | 128M | if (!is_horz_ws(*c)) |
118 | 110M | last_non_space = c; |
119 | 128M | } |
120 | 11.4M | text = std::string(original_c, last_non_space - original_c + 1); |
121 | | // Consume subsequent whitespace including the '\n'. |
122 | 11.4M | unsigned new_lines; |
123 | 11.4M | lex_ws(c, new_lines, indent, line_start, line_number); |
124 | 11.4M | blanks = new_lines == 0 ? 0 : new_lines - 1; |
125 | 11.4M | } |
126 | | |
/** Is the char an upper-case letter in the range 'A'..'Z'. */
static bool is_upper(char c)
{
    return 'A' <= c && c <= 'Z';
}
131 | | |
/** Is the char a lower-case letter in the range 'a'..'z'. */
static bool is_lower(char c)
{
    return 'a' <= c && c <= 'z';
}
136 | | |
/** Is the char a decimal digit in the range '0'..'9'. */
static bool is_number(char c)
{
    return '0' <= c && c <= '9';
}
141 | | |
142 | | static bool is_identifier_first(char c) |
143 | 984M | { |
144 | 984M | return is_upper(c) || is_lower(c) || c == '_'; |
145 | 984M | } |
146 | | |
147 | | static bool is_identifier(char c) |
148 | 790M | { |
149 | 790M | return is_identifier_first(c) || is_number(c); |
150 | 790M | } |
151 | | |
/** Is the char one of the punctuation characters that operators are built from. */
static bool is_symbol(char c)
{
    switch (c) {
        // clang-format off
        case '!': case '$': case ':': case '~':
        case '+': case '-': case '&': case '|':
        case '^': case '=': case '<': case '>':
        case '*': case '/': case '%':
            return true;
        // clang-format on

        default: return false;
    }
}
173 | | |
/** May a multi-character operator end with this char?
 *
 * The characters + - ~ ! $ may not; every other char is accepted.
 */
bool allowed_at_end_of_operator(char c)
{
    const bool forbidden = c == '+' || c == '-' || c == '~' || c == '!' || c == '$';
    return !forbidden;
}
184 | | |
// Reserved words of the language, mapped to the token kind each one lexes to.  Any identifier
// not present in this table is an ordinary Token::IDENTIFIER (see lex_get_keyword_kind).
static const std::map<std::string, Token::Kind> keywords = {
    {"assert", Token::ASSERT},
    {"else", Token::ELSE},
    {"error", Token::ERROR},
    {"false", Token::FALSE},
    {"for", Token::FOR},
    {"function", Token::FUNCTION},
    {"if", Token::IF},
    {"import", Token::IMPORT},
    {"importstr", Token::IMPORTSTR},
    {"importbin", Token::IMPORTBIN},
    {"in", Token::IN},
    {"local", Token::LOCAL},
    {"null", Token::NULL_LIT},
    {"self", Token::SELF},
    {"super", Token::SUPER},
    {"tailstrict", Token::TAILSTRICT},
    {"then", Token::THEN},
    {"true", Token::TRUE},
};
205 | | |
206 | | Token::Kind lex_get_keyword_kind(const std::string &identifier) |
207 | 143M | { |
208 | 143M | auto it = keywords.find(identifier); |
209 | 143M | if (it == keywords.end()) |
210 | 105M | return Token::IDENTIFIER; |
211 | 38.3M | return it->second; |
212 | 143M | } |
213 | | |
214 | | std::string lex_number(const char *&c, const std::string &filename, const Location &begin) |
215 | 21.4M | { |
216 | | // This function should be understood with reference to the linked image: |
217 | | // https://www.json.org/img/number.png |
218 | | |
219 | | // Note, we deviate from the json.org documentation as follows: |
220 | | // There is no reason to lex negative numbers as atomic tokens, it is better to parse them |
221 | | // as a unary operator combined with a numeric literal. This avoids x-1 being tokenized as |
222 | | // <identifier> <number> instead of the intended <identifier> <binop> <number>. |
223 | | |
224 | 21.4M | enum State { |
225 | 21.4M | BEGIN, |
226 | 21.4M | AFTER_ZERO, |
227 | 21.4M | AFTER_ONE_TO_NINE, |
228 | 21.4M | AFTER_DOT, |
229 | 21.4M | AFTER_DIGIT, |
230 | 21.4M | AFTER_E, |
231 | 21.4M | AFTER_EXP_SIGN, |
232 | 21.4M | AFTER_EXP_DIGIT |
233 | 21.4M | } state; |
234 | | |
235 | 21.4M | std::string r; |
236 | | |
237 | 21.4M | state = BEGIN; |
238 | 48.5M | while (true) { |
239 | 48.5M | switch (state) { |
240 | 21.4M | case BEGIN: |
241 | 21.4M | switch (*c) { |
242 | 10.3M | case '0': state = AFTER_ZERO; break; |
243 | | |
244 | 4.47M | case '1': |
245 | 5.35M | case '2': |
246 | 6.16M | case '3': |
247 | 7.23M | case '4': |
248 | 7.44M | case '5': |
249 | 8.28M | case '6': |
250 | 8.67M | case '7': |
251 | 10.1M | case '8': |
252 | 11.1M | case '9': state = AFTER_ONE_TO_NINE; break; |
253 | | |
254 | 0 | default: throw StaticError(filename, begin, "couldn't lex number"); |
255 | 21.4M | } |
256 | 21.4M | break; |
257 | | |
258 | 21.4M | case AFTER_ZERO: |
259 | 10.3M | switch (*c) { |
260 | 53.1k | case '.': state = AFTER_DOT; break; |
261 | | |
262 | 782 | case 'e': |
263 | 4.63k | case 'E': state = AFTER_E; break; |
264 | | |
265 | 10.2M | default: goto end; |
266 | 10.3M | } |
267 | 57.7k | break; |
268 | | |
269 | 14.1M | case AFTER_ONE_TO_NINE: |
270 | 14.1M | switch (*c) { |
271 | 36.7k | case '.': state = AFTER_DOT; break; |
272 | | |
273 | 2.90k | case 'e': |
274 | 275k | case 'E': state = AFTER_E; break; |
275 | | |
276 | 1.08M | case '0': |
277 | 1.12M | case '1': |
278 | 1.41M | case '2': |
279 | 1.51M | case '3': |
280 | 1.62M | case '4': |
281 | 1.87M | case '5': |
282 | 2.02M | case '6': |
283 | 2.10M | case '7': |
284 | 2.18M | case '8': |
285 | 2.99M | case '9': state = AFTER_ONE_TO_NINE; break; |
286 | | |
287 | 10.8M | default: goto end; |
288 | 14.1M | } |
289 | 3.30M | break; |
290 | | |
291 | 3.30M | case AFTER_DOT: |
292 | 89.8k | switch (*c) { |
293 | 3.73k | case '0': |
294 | 30.3k | case '1': |
295 | 31.8k | case '2': |
296 | 33.6k | case '3': |
297 | 35.2k | case '4': |
298 | 84.5k | case '5': |
299 | 85.5k | case '6': |
300 | 86.2k | case '7': |
301 | 88.5k | case '8': |
302 | 89.8k | case '9': state = AFTER_DIGIT; break; |
303 | | |
304 | 26 | default: { |
305 | 26 | std::stringstream ss; |
306 | 26 | ss << "couldn't lex number, junk after decimal point: " << *c; |
307 | 26 | throw StaticError(filename, begin, ss.str()); |
308 | 88.5k | } |
309 | 89.8k | } |
310 | 89.8k | break; |
311 | | |
312 | 1.66M | case AFTER_DIGIT: |
313 | 1.66M | switch (*c) { |
314 | 1.95k | case 'e': |
315 | 3.08k | case 'E': state = AFTER_E; break; |
316 | | |
317 | 1.10M | case '0': |
318 | 1.18M | case '1': |
319 | 1.20M | case '2': |
320 | 1.26M | case '3': |
321 | 1.29M | case '4': |
322 | 1.37M | case '5': |
323 | 1.43M | case '6': |
324 | 1.46M | case '7': |
325 | 1.50M | case '8': |
326 | 1.57M | case '9': state = AFTER_DIGIT; break; |
327 | | |
328 | 86.7k | default: goto end; |
329 | 1.66M | } |
330 | 1.58M | break; |
331 | | |
332 | 1.58M | case AFTER_E: |
333 | 283k | switch (*c) { |
334 | 1.96k | case '+': |
335 | 3.46k | case '-': state = AFTER_EXP_SIGN; break; |
336 | | |
337 | 3.09k | case '0': |
338 | 5.78k | case '1': |
339 | 275k | case '2': |
340 | 277k | case '3': |
341 | 277k | case '4': |
342 | 278k | case '5': |
343 | 278k | case '6': |
344 | 278k | case '7': |
345 | 279k | case '8': |
346 | 279k | case '9': state = AFTER_EXP_DIGIT; break; |
347 | | |
348 | 86 | default: { |
349 | 86 | std::stringstream ss; |
350 | 86 | ss << "couldn't lex number, junk after 'E': " << *c; |
351 | 86 | throw StaticError(filename, begin, ss.str()); |
352 | 279k | } |
353 | 283k | } |
354 | 283k | break; |
355 | | |
356 | 283k | case AFTER_EXP_SIGN: |
357 | 3.46k | switch (*c) { |
358 | 596 | case '0': |
359 | 1.05k | case '1': |
360 | 1.15k | case '2': |
361 | 2.96k | case '3': |
362 | 3.32k | case '4': |
363 | 3.33k | case '5': |
364 | 3.33k | case '6': |
365 | 3.34k | case '7': |
366 | 3.35k | case '8': |
367 | 3.44k | case '9': state = AFTER_EXP_DIGIT; break; |
368 | | |
369 | 18 | default: { |
370 | 18 | std::stringstream ss; |
371 | 18 | ss << "couldn't lex number, junk after exponent sign: " << *c; |
372 | 18 | throw StaticError(filename, begin, ss.str()); |
373 | 3.35k | } |
374 | 3.46k | } |
375 | 3.44k | break; |
376 | | |
377 | 568k | case AFTER_EXP_DIGIT: |
378 | 568k | switch (*c) { |
379 | 4.96k | case '0': |
380 | 7.33k | case '1': |
381 | 8.75k | case '2': |
382 | 276k | case '3': |
383 | 278k | case '4': |
384 | 279k | case '5': |
385 | 279k | case '6': |
386 | 282k | case '7': |
387 | 283k | case '8': |
388 | 285k | case '9': state = AFTER_EXP_DIGIT; break; |
389 | | |
390 | 283k | default: goto end; |
391 | 568k | } |
392 | 285k | break; |
393 | 48.5M | } |
394 | 27.0M | r += *c; |
395 | 27.0M | c++; |
396 | 27.0M | } |
397 | 21.4M | end: |
398 | 21.4M | return r; |
399 | 21.4M | } |
400 | | |
// Measure the leading run of spaces/tabs of a and verify that b begins with exactly the same
// characters.  Returns the length of that run, or 0 if b differs anywhere within it.  If a has
// no whitespace prefix then 0 is returned as well.
static int whitespace_check(const char *a, const char *b)
{
    int n = 0;
    for (; a[n] == ' ' || a[n] == '\t'; ++n) {
        if (a[n] != b[n])
            return 0;
    }
    return n;
}
413 | | |
414 | | /* |
415 | | static void add_whitespace(Fodder &fodder, const char *s, size_t n) |
416 | | { |
417 | | std::string ws(s, n); |
418 | | if (fodder.size() == 0 || fodder.back().kind != FodderElement::WHITESPACE) { |
419 | | fodder.emplace_back(FodderElement::WHITESPACE, ws); |
420 | | } else { |
421 | | fodder.back().data += ws; |
422 | | } |
423 | | } |
424 | | */ |
425 | | |
426 | | Tokens jsonnet_lex(const std::string &filename, const char *input) |
427 | 47.7k | { |
428 | 47.7k | unsigned long line_number = 1; |
429 | 47.7k | const char *line_start = input; |
430 | | |
431 | 47.7k | Tokens r; |
432 | | |
433 | 47.7k | const char *c = input; |
434 | | |
435 | 47.7k | Fodder fodder; |
436 | 47.7k | bool fresh_line = true; // Are we tokenizing from the beginning of a new line? |
437 | | |
438 | 352M | while (*c != '\0') { |
439 | | // Used to ensure we have actually advanced the pointer by the end of the iteration. |
440 | 352M | const char *original_c = c; |
441 | | |
442 | 352M | Token::Kind kind; |
443 | 352M | std::string data; |
444 | 352M | std::string string_block_indent; |
445 | 352M | std::string string_block_term_indent; |
446 | | |
447 | 352M | unsigned new_lines, indent; |
448 | 352M | lex_ws(c, new_lines, indent, line_start, line_number); |
449 | | |
450 | | // If it's the end of the file, discard final whitespace. |
451 | 352M | if (*c == '\0') |
452 | 24.6k | break; |
453 | | |
454 | 352M | if (new_lines > 0) { |
455 | | // Otherwise store whitespace in fodder. |
456 | 35.9M | unsigned blanks = new_lines - 1; |
457 | 35.9M | fodder.emplace_back(FodderElement::LINE_END, blanks, indent, EMPTY); |
458 | 35.9M | fresh_line = true; |
459 | 35.9M | } |
460 | | |
461 | 352M | Location begin(line_number, c - line_start + 1); |
462 | | |
463 | 352M | switch (*c) { |
464 | | // The following operators should never be combined with subsequent symbols. |
465 | 1.52M | case '{': |
466 | 1.52M | kind = Token::BRACE_L; |
467 | 1.52M | c++; |
468 | 1.52M | break; |
469 | | |
470 | 1.50M | case '}': |
471 | 1.50M | kind = Token::BRACE_R; |
472 | 1.50M | c++; |
473 | 1.50M | break; |
474 | | |
475 | 7.66M | case '[': |
476 | 7.66M | kind = Token::BRACKET_L; |
477 | 7.66M | c++; |
478 | 7.66M | break; |
479 | | |
480 | 7.63M | case ']': |
481 | 7.63M | kind = Token::BRACKET_R; |
482 | 7.63M | c++; |
483 | 7.63M | break; |
484 | | |
485 | 30.9M | case ',': |
486 | 30.9M | kind = Token::COMMA; |
487 | 30.9M | c++; |
488 | 30.9M | break; |
489 | | |
490 | 17.2M | case '.': |
491 | 17.2M | kind = Token::DOT; |
492 | 17.2M | c++; |
493 | 17.2M | break; |
494 | | |
495 | 26.0M | case '(': |
496 | 26.0M | kind = Token::PAREN_L; |
497 | 26.0M | c++; |
498 | 26.0M | break; |
499 | | |
500 | 25.9M | case ')': |
501 | 25.9M | kind = Token::PAREN_R; |
502 | 25.9M | c++; |
503 | 25.9M | break; |
504 | | |
505 | 6.76M | case ';': |
506 | 6.76M | kind = Token::SEMICOLON; |
507 | 6.76M | c++; |
508 | 6.76M | break; |
509 | | |
510 | | // Numeric literals. |
511 | 10.3M | case '0': |
512 | 14.7M | case '1': |
513 | 15.6M | case '2': |
514 | 16.4M | case '3': |
515 | 17.5M | case '4': |
516 | 17.7M | case '5': |
517 | 18.5M | case '6': |
518 | 18.9M | case '7': |
519 | 20.4M | case '8': |
520 | 21.4M | case '9': |
521 | 21.4M | kind = Token::NUMBER; |
522 | 21.4M | data = lex_number(c, filename, begin); |
523 | 21.4M | break; |
524 | | |
525 | | // UString literals. |
526 | 416k | case '"': { |
527 | 416k | c++; |
528 | 87.0M | for (;; ++c) { |
529 | 87.0M | if (*c == '\0') { |
530 | 95 | throw StaticError(filename, begin, "unterminated string"); |
531 | 95 | } |
532 | 87.0M | if (*c == '"') { |
533 | 416k | break; |
534 | 416k | } |
535 | 86.6M | if (*c == '\\' && *(c + 1) != '\0') { |
536 | 241k | data += *c; |
537 | 241k | ++c; |
538 | 241k | } |
539 | 86.6M | if (*c == '\n') { |
540 | | // Maintain line/column counters. |
541 | 7.12M | line_number++; |
542 | 7.12M | line_start = c + 1; |
543 | 7.12M | } |
544 | 86.6M | data += *c; |
545 | 86.6M | } |
546 | 416k | c++; // Advance beyond the ". |
547 | 416k | kind = Token::STRING_DOUBLE; |
548 | 416k | } break; |
549 | | |
550 | | // UString literals. |
551 | 11.3M | case '\'': { |
552 | 11.3M | c++; |
553 | 149M | for (;; ++c) { |
554 | 149M | if (*c == '\0') { |
555 | 82 | throw StaticError(filename, begin, "unterminated string"); |
556 | 82 | } |
557 | 149M | if (*c == '\'') { |
558 | 11.3M | break; |
559 | 11.3M | } |
560 | 137M | if (*c == '\\' && *(c + 1) != '\0') { |
561 | 1.24M | data += *c; |
562 | 1.24M | ++c; |
563 | 1.24M | } |
564 | 137M | if (*c == '\n') { |
565 | | // Maintain line/column counters. |
566 | 3.64M | line_number++; |
567 | 3.64M | line_start = c + 1; |
568 | 3.64M | } |
569 | 137M | data += *c; |
570 | 137M | } |
571 | 11.3M | c++; // Advance beyond the '. |
572 | 11.3M | kind = Token::STRING_SINGLE; |
573 | 11.3M | } break; |
574 | | |
575 | | // Verbatim string literals. |
576 | | // ' and " quoting is interpreted here, unlike non-verbatim strings |
577 | | // where it is done later by jsonnet_string_unescape. This is OK |
578 | | // in this case because no information is lost by resoving the |
579 | | // repeated quote into a single quote, so we can go back to the |
580 | | // original form in the formatter. |
581 | 10.4k | case '@': { |
582 | 10.4k | c++; |
583 | 10.4k | if (*c != '"' && *c != '\'') { |
584 | 36 | std::stringstream ss; |
585 | 36 | ss << "couldn't lex verbatim string, junk after '@': " << *c; |
586 | 36 | throw StaticError(filename, begin, ss.str()); |
587 | 36 | } |
588 | 10.3k | const char quot = *c; |
589 | 10.3k | c++; // Advance beyond the opening quote. |
590 | 415k | for (;; ++c) { |
591 | 415k | if (*c == '\0') { |
592 | 74 | throw StaticError(filename, begin, "unterminated verbatim string"); |
593 | 74 | } |
594 | 414k | if (*c == quot) { |
595 | 13.1k | if (*(c + 1) == quot) { |
596 | 2.81k | c++; |
597 | 10.3k | } else { |
598 | 10.3k | break; |
599 | 10.3k | } |
600 | 13.1k | } |
601 | 404k | data += *c; |
602 | 404k | } |
603 | 10.3k | c++; // Advance beyond the closing quote. |
604 | 10.3k | if (quot == '"') { |
605 | 6.75k | kind = Token::VERBATIM_STRING_DOUBLE; |
606 | 6.75k | } else { |
607 | 3.56k | kind = Token::VERBATIM_STRING_SINGLE; |
608 | 3.56k | } |
609 | 10.3k | } break; |
610 | | |
611 | | // Keywords |
612 | 193M | default: |
613 | 193M | if (is_identifier_first(*c)) { |
614 | 143M | std::string id; |
615 | 790M | for (; is_identifier(*c); ++c) |
616 | 646M | id += *c; |
617 | 143M | kind = lex_get_keyword_kind(id); |
618 | 143M | data = id; |
619 | | |
620 | 143M | } else if (is_symbol(*c) || *c == '#') { |
621 | | // Single line C++ and Python style comments. |
622 | 49.9M | if (*c == '#' || (*c == '/' && *(c + 1) == '/')) { |
623 | 11.4M | std::vector<std::string> comment(1); |
624 | 11.4M | unsigned blanks; |
625 | 11.4M | unsigned indent; |
626 | 11.4M | lex_until_newline(c, comment[0], blanks, indent, line_start, line_number); |
627 | 11.4M | auto kind = fresh_line ? FodderElement::PARAGRAPH : FodderElement::LINE_END; |
628 | 11.4M | fodder.emplace_back(kind, blanks, indent, comment); |
629 | 11.4M | fresh_line = true; |
630 | 11.4M | continue; // We've not got a token, just fodder, so keep scanning. |
631 | 11.4M | } |
632 | | |
633 | | // Multi-line C style comment. |
634 | 38.5M | if (*c == '/' && *(c + 1) == '*') { |
635 | 1.51M | unsigned margin = c - line_start; |
636 | | |
637 | 1.51M | const char *initial_c = c; |
638 | 1.51M | c += 2; // Avoid matching /*/: skip the /* before starting the search for |
639 | | // */. |
640 | | |
641 | 138M | while (!(*c == '*' && *(c + 1) == '/')) { |
642 | 136M | if (*c == '\0') { |
643 | 218 | auto msg = "multi-line comment has no terminating */."; |
644 | 218 | throw StaticError(filename, begin, msg); |
645 | 218 | } |
646 | 136M | if (*c == '\n') { |
647 | | // Just keep track of the line / column counters. |
648 | 19.3M | line_number++; |
649 | 19.3M | line_start = c + 1; |
650 | 19.3M | } |
651 | 136M | ++c; |
652 | 136M | } |
653 | 1.51M | c += 2; // Move the pointer to the char after the closing '/'. |
654 | | |
655 | 1.51M | std::string comment(initial_c, |
656 | 1.51M | c - initial_c); // Includes the "/*" and "*/". |
657 | | |
658 | | // Lex whitespace after comment |
659 | 1.51M | unsigned new_lines_after, indent_after; |
660 | 1.51M | lex_ws(c, new_lines_after, indent_after, line_start, line_number); |
661 | 1.51M | std::vector<std::string> lines; |
662 | 1.51M | if (comment.find('\n') >= comment.length()) { |
663 | | // Comment looks like /* foo */ |
664 | 1.08M | lines.push_back(comment); |
665 | 1.08M | fodder.emplace_back(FodderElement::INTERSTITIAL, 0, 0, lines); |
666 | 1.08M | if (new_lines_after > 0) { |
667 | 937k | fodder.emplace_back(FodderElement::LINE_END, |
668 | 937k | new_lines_after - 1, |
669 | 937k | indent_after, |
670 | 937k | EMPTY); |
671 | 937k | fresh_line = true; |
672 | 937k | } |
673 | 1.08M | } else { |
674 | 428k | lines = line_split(comment, margin); |
675 | 428k | assert(lines[0][0] == '/'); |
676 | | // Little hack to support PARAGRAPHs with * down the LHS: |
677 | | // Add a space to lines that start with a '*' |
678 | 428k | bool all_star = true; |
679 | 19.7M | for (auto &l : lines) { |
680 | 19.7M | if (l[0] != '*') |
681 | 19.5M | all_star = false; |
682 | 19.7M | } |
683 | 428k | if (all_star) { |
684 | 0 | for (auto &l : lines) { |
685 | 0 | if (l[0] == '*') |
686 | 0 | l = " " + l; |
687 | 0 | } |
688 | 0 | } |
689 | 428k | if (new_lines_after == 0) { |
690 | | // Ensure a line end after the paragraph. |
691 | 27.3k | new_lines_after = 1; |
692 | 27.3k | indent_after = 0; |
693 | 27.3k | } |
694 | 428k | fodder_push_back(fodder, |
695 | 428k | FodderElement(FodderElement::PARAGRAPH, |
696 | 428k | new_lines_after - 1, |
697 | 428k | indent_after, |
698 | 428k | lines)); |
699 | 428k | fresh_line = true; |
700 | 428k | } |
701 | 1.51M | continue; // We've not got a token, just fodder, so keep scanning. |
702 | 1.51M | } |
703 | | |
704 | | // Text block |
705 | 37.0M | if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|') { |
706 | 17.6k | c += 3; // Skip the "|||". |
707 | | |
708 | 17.6k | bool chomp_trailing_nl = false; |
709 | 17.6k | if (*c == '-') { |
710 | 1.03k | chomp_trailing_nl = true; |
711 | 1.03k | c++; |
712 | 1.03k | } |
713 | | |
714 | 21.1k | while (is_horz_ws(*c)) ++c; // Chomp whitespace at end of line. |
715 | 17.6k | if (*c != '\n') { |
716 | 108 | auto msg = "text block syntax requires new line after |||."; |
717 | 108 | throw StaticError(filename, begin, msg); |
718 | 108 | } |
719 | 17.5k | std::stringstream block; |
720 | 17.5k | c++; // Skip the "\n" |
721 | 17.5k | line_number++; |
722 | | // Skip any blank lines at the beginning of the block. |
723 | 22.8k | while (*c == '\n') { |
724 | 5.37k | line_number++; |
725 | 5.37k | ++c; |
726 | 5.37k | block << '\n'; |
727 | 5.37k | } |
728 | 17.5k | line_start = c; |
729 | 17.5k | const char *first_line = c; |
730 | 17.5k | int ws_chars = whitespace_check(first_line, c); |
731 | 17.5k | string_block_indent = std::string(first_line, ws_chars); |
732 | 17.5k | if (ws_chars == 0) { |
733 | 83 | auto msg = "text block's first line must start with whitespace."; |
734 | 83 | throw StaticError(filename, begin, msg); |
735 | 83 | } |
736 | 22.1k | while (true) { |
737 | 22.1k | assert(ws_chars > 0); |
738 | | // Read up to the \n |
739 | 14.7M | for (c = &c[ws_chars]; *c != '\n'; ++c) { |
740 | 14.7M | if (*c == '\0') |
741 | 173 | throw StaticError(filename, begin, "unexpected EOF"); |
742 | 14.7M | block << *c; |
743 | 14.7M | } |
744 | | // Add the \n |
745 | 21.9k | block << '\n'; |
746 | 21.9k | ++c; |
747 | 21.9k | line_number++; |
748 | 21.9k | line_start = c; |
749 | | // Skip any blank lines |
750 | 24.6k | while (*c == '\n') { |
751 | 2.66k | line_number++; |
752 | 2.66k | ++c; |
753 | 2.66k | block << '\n'; |
754 | 2.66k | } |
755 | | // Examine next line |
756 | 21.9k | ws_chars = whitespace_check(first_line, c); |
757 | 21.9k | if (ws_chars == 0) { |
758 | | // End of text block |
759 | | // Skip over any whitespace |
760 | 91.0k | while (*c == ' ' || *c == '\t') { |
761 | 73.7k | string_block_term_indent += *c; |
762 | 73.7k | ++c; |
763 | 73.7k | } |
764 | | // Expect ||| |
765 | 17.2k | if (!(*c == '|' && *(c + 1) == '|' && *(c + 2) == '|')) { |
766 | 231 | auto msg = "text block not terminated with |||"; |
767 | 231 | throw StaticError(filename, begin, msg); |
768 | 231 | } |
769 | 17.0k | c += 3; // Leave after the last | |
770 | 17.0k | data = block.str(); |
771 | 17.0k | kind = Token::STRING_BLOCK; |
772 | 17.0k | if (chomp_trailing_nl) { |
773 | 1.01k | assert(data.back() == '\n'); |
774 | 1.01k | data.pop_back(); |
775 | 1.01k | } |
776 | 17.0k | break; // Out of the while loop. |
777 | 17.0k | } |
778 | 21.9k | } |
779 | | |
780 | 17.0k | break; // Out of the switch. |
781 | 17.4k | } |
782 | | |
783 | 37.0M | const char *operator_begin = c; |
784 | 120M | for (; is_symbol(*c); ++c) { |
785 | | // Not allowed // in operators |
786 | 83.6M | if (*c == '/' && *(c + 1) == '/') |
787 | 1.93k | break; |
788 | | // Not allowed /* in operators |
789 | 83.6M | if (*c == '/' && *(c + 1) == '*') |
790 | 1.55k | break; |
791 | | // Not allowed ||| in operators |
792 | 83.6M | if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|') |
793 | 2.04k | break; |
794 | 83.6M | } |
795 | | // Not allowed to end with a + - ~ ! unless a single char. |
796 | | // So, wind it back if we need to (but not too far). |
797 | 48.9M | while (c > operator_begin + 1 && !allowed_at_end_of_operator(*(c - 1))) { |
798 | 11.9M | c--; |
799 | 11.9M | } |
800 | 37.0M | data += std::string(operator_begin, c); |
801 | 37.0M | if (data == "$") { |
802 | 121k | kind = Token::DOLLAR; |
803 | 121k | data = ""; |
804 | 36.8M | } else { |
805 | 36.8M | kind = Token::OPERATOR; |
806 | 36.8M | } |
807 | 37.0M | } else { |
808 | 388 | std::stringstream ss; |
809 | 388 | ss << "Could not lex the character "; |
810 | 388 | auto uc = (unsigned char)(*c); |
811 | 388 | if (*c < 32) |
812 | 352 | ss << "code " << unsigned(uc); |
813 | 36 | else |
814 | 36 | ss << "'" << *c << "'"; |
815 | 388 | throw StaticError(filename, begin, ss.str()); |
816 | 388 | } |
817 | 352M | } |
818 | | |
819 | | // Ensure that a bug in the above code does not cause an infinite memory consuming loop due |
820 | | // to pushing empty tokens. |
821 | 339M | if (c == original_c) { |
822 | 0 | throw StaticError(filename, begin, "internal lexing error: pointer did not advance"); |
823 | 0 | } |
824 | | |
825 | 339M | Location end(line_number, (c + 1) - line_start); |
826 | 339M | r.emplace_back(kind, |
827 | 339M | fodder, |
828 | 339M | data, |
829 | 339M | string_block_indent, |
830 | 339M | string_block_term_indent, |
831 | 339M | LocationRange(filename, begin, end)); |
832 | 339M | fodder.clear(); |
833 | 339M | fresh_line = false; |
834 | 339M | } |
835 | | |
836 | 46.1k | Location begin(line_number, c - line_start + 1); |
837 | 46.1k | Location end(line_number, (c + 1) - line_start + 1); |
838 | 46.1k | r.emplace_back(Token::END_OF_FILE, fodder, "", "", "", LocationRange(filename, begin, end)); |
839 | 46.1k | return r; |
840 | 47.7k | } |
841 | | |
842 | | std::string jsonnet_unlex(const Tokens &tokens) |
843 | 0 | { |
844 | 0 | std::stringstream ss; |
845 | 0 | for (const auto &t : tokens) { |
846 | 0 | for (const auto &f : t.fodder) { |
847 | 0 | switch (f.kind) { |
848 | 0 | case FodderElement::LINE_END: { |
849 | 0 | if (f.comment.size() > 0) { |
850 | 0 | ss << "LineEnd(" << f.blanks << ", " << f.indent << ", " << f.comment[0] |
851 | 0 | << ")\n"; |
852 | 0 | } else { |
853 | 0 | ss << "LineEnd(" << f.blanks << ", " << f.indent << ")\n"; |
854 | 0 | } |
855 | 0 | } break; |
856 | | |
857 | 0 | case FodderElement::INTERSTITIAL: { |
858 | 0 | ss << "Interstitial(" << f.comment[0] << ")\n"; |
859 | 0 | } break; |
860 | | |
861 | 0 | case FodderElement::PARAGRAPH: { |
862 | 0 | ss << "Paragraph(\n"; |
863 | 0 | for (const auto &line : f.comment) { |
864 | 0 | ss << " " << line << '\n'; |
865 | 0 | } |
866 | 0 | ss << ")" << f.blanks << "\n"; |
867 | 0 | } break; |
868 | 0 | } |
869 | 0 | } |
870 | 0 | if (t.kind == Token::END_OF_FILE) { |
871 | 0 | ss << "EOF\n"; |
872 | 0 | break; |
873 | 0 | } |
874 | 0 | if (t.kind == Token::STRING_DOUBLE) { |
875 | 0 | ss << "\"" << t.data << "\"\n"; |
876 | 0 | } else if (t.kind == Token::STRING_SINGLE) { |
877 | 0 | ss << "'" << t.data << "'\n"; |
878 | 0 | } else if (t.kind == Token::STRING_BLOCK) { |
879 | 0 | ss << "|||\n"; |
880 | 0 | ss << t.stringBlockIndent; |
881 | 0 | for (const char *cp = t.data.c_str(); *cp != '\0'; ++cp) { |
882 | 0 | ss << *cp; |
883 | 0 | if (*cp == '\n' && *(cp + 1) != '\n' && *(cp + 1) != '\0') { |
884 | 0 | ss << t.stringBlockIndent; |
885 | 0 | } |
886 | 0 | } |
887 | 0 | ss << t.stringBlockTermIndent << "|||\n"; |
888 | 0 | } else { |
889 | 0 | ss << t.data << "\n"; |
890 | 0 | } |
891 | 0 | } |
892 | 0 | return ss.str(); |
893 | 0 | } |
894 | | |
895 | | } // namespace jsonnet::internal |