/src/jsonnet/core/lexer.cpp
Line | Count | Source |
1 | | /* |
2 | | Copyright 2015 Google Inc. All rights reserved. |
3 | | |
4 | | Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | you may not use this file except in compliance with the License. |
6 | | You may obtain a copy of the License at |
7 | | |
8 | | http://www.apache.org/licenses/LICENSE-2.0 |
9 | | |
10 | | Unless required by applicable law or agreed to in writing, software |
11 | | distributed under the License is distributed on an "AS IS" BASIS, |
12 | | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | See the License for the specific language governing permissions and |
14 | | limitations under the License. |
15 | | */ |
16 | | |
17 | | #include <cassert> |
18 | | |
19 | | #include <map> |
20 | | #include <sstream> |
21 | | #include <string> |
22 | | |
23 | | #include "lexer.h" |
24 | | #include "static_error.h" |
25 | | #include "unicode.h" |
26 | | |
27 | | namespace jsonnet::internal { |
28 | | |
/** Shared empty comment list, passed when a fodder element carries no comment text. */
static const std::vector<std::string> EMPTY{};
30 | | |
/** True for horizontal whitespace: space, tab or carriage return (never \n). */
static bool is_horz_ws(char c)
{
    switch (c) {
        case ' ':
        case '\t':
        case '\r': return true;
        default: return false;
    }
}
36 | | |
37 | | /** Is the char whitespace. */ |
38 | | static bool is_ws(char c) |
39 | 724M | { |
40 | 724M | return c == '\n' || is_horz_ws(c); |
41 | 724M | } |
42 | | |
43 | | /** Strip whitespace from both ends of a string, but only up to margin on the left hand side. */ |
44 | | static std::string strip_ws(const std::string &s, unsigned margin) |
45 | 19.5M | { |
46 | 19.5M | if (s.size() == 0) |
47 | 16.4M | return s; // Avoid underflow below. |
48 | 3.04M | size_t i = 0; |
49 | 8.22M | while (i < s.length() && is_horz_ws(s[i]) && i < margin) |
50 | 5.18M | i++; |
51 | 3.04M | size_t j = s.size(); |
52 | 7.18M | while (j > i && is_horz_ws(s[j - 1])) { |
53 | 4.13M | j--; |
54 | 4.13M | } |
55 | 3.04M | return std::string(&s[i], &s[j]); |
56 | 19.5M | } |
57 | | |
58 | | /** Split a string by \n and also strip left (up to margin) & right whitespace from each line. */ |
59 | | static std::vector<std::string> line_split(const std::string &s, unsigned margin) |
60 | 320k | { |
61 | 320k | std::vector<std::string> ret; |
62 | 320k | std::stringstream ss; |
63 | 118M | for (size_t i = 0; i < s.length(); ++i) { |
64 | 118M | if (s[i] == '\n') { |
65 | 19.2M | ret.emplace_back(strip_ws(ss.str(), margin)); |
66 | 19.2M | ss.str(""); |
67 | 99.2M | } else { |
68 | 99.2M | ss << s[i]; |
69 | 99.2M | } |
70 | 118M | } |
71 | 320k | ret.emplace_back(strip_ws(ss.str(), margin)); |
72 | 320k | return ret; |
73 | 320k | } |
74 | | |
75 | | /** Consume whitespace. |
76 | | * |
77 | | * Return number of \n and number of spaces after last \n. Convert \t to spaces. |
78 | | */ |
79 | | static void lex_ws(const char *&c, unsigned &new_lines, unsigned &indent, const char *&line_start, |
80 | | unsigned long &line_number) |
81 | 313M | { |
82 | 313M | indent = 0; |
83 | 313M | new_lines = 0; |
84 | 724M | for (; *c != '\0' && is_ws(*c); c++) { |
85 | 411M | switch (*c) { |
86 | 768k | case '\r': |
87 | | // Ignore. |
88 | 768k | break; |
89 | | |
90 | 45.5M | case '\n': |
91 | 45.5M | indent = 0; |
92 | 45.5M | new_lines++; |
93 | 45.5M | line_number++; |
94 | 45.5M | line_start = c + 1; |
95 | 45.5M | break; |
96 | | |
97 | 365M | case ' ': indent += 1; break; |
98 | | |
99 | | // This only works for \t at the beginning of lines, but we strip it everywhere else |
100 | | // anyway. The only case where this will cause a problem is spaces followed by \t |
101 | | // at the beginning of a line. However that is rare, ill-advised, and if re-indentation |
102 | | // is enabled it will be fixed later. |
103 | 44.3k | case '\t': indent += 8; break; |
104 | 411M | } |
105 | 411M | } |
106 | 313M | } |
107 | | |
108 | | /** |
109 | | # Consume all text until the end of the line, return number of newlines after that and indent |
110 | | */ |
111 | | static void lex_until_newline(const char *&c, std::string &text, unsigned &blanks, unsigned &indent, |
112 | | const char *&line_start, unsigned long &line_number) |
113 | 8.23M | { |
114 | 8.23M | const char *original_c = c; |
115 | 8.23M | const char *last_non_space = c; |
116 | 120M | for (; *c != '\0' && *c != '\n'; c++) { |
117 | 112M | if (!is_horz_ws(*c)) |
118 | 96.2M | last_non_space = c; |
119 | 112M | } |
120 | 8.23M | text = std::string(original_c, last_non_space - original_c + 1); |
121 | | // Consume subsequent whitespace including the '\n'. |
122 | 8.23M | unsigned new_lines; |
123 | 8.23M | lex_ws(c, new_lines, indent, line_start, line_number); |
124 | 8.23M | blanks = new_lines == 0 ? 0 : new_lines - 1; |
125 | 8.23M | } |
126 | | |
/** True for the ASCII letters 'A' through 'Z'. */
static bool is_upper(char c)
{
    return !(c < 'A' || c > 'Z');
}
131 | | |
/** True for the ASCII letters 'a' through 'z'. */
static bool is_lower(char c)
{
    return !(c < 'a' || c > 'z');
}
136 | | |
/** True for the ASCII digits '0' through '9'. */
static bool is_number(char c)
{
    return !(c < '0' || c > '9');
}
141 | | |
142 | | static bool is_identifier_first(char c) |
143 | 864M | { |
144 | 864M | return is_upper(c) || is_lower(c) || c == '_'; |
145 | 864M | } |
146 | | |
147 | | static bool is_identifier(char c) |
148 | 694M | { |
149 | 694M | return is_identifier_first(c) || is_number(c); |
150 | 694M | } |
151 | | |
/** True if the char is one of the punctuation chars that operators are built from. */
static bool is_symbol(char c)
{
    switch (c) {
        case '!': case '$': case ':': case '~':
        case '+': case '-':
        case '&': case '|': case '^':
        case '=': case '<': case '>':
        case '*': case '/': case '%':
            return true;

        default:
            return false;
    }
}
173 | | |
/** Whether a multi-character operator may end with this char.
 *
 * The chars + - ~ ! $ are rejected; every other char is accepted.
 */
bool allowed_at_end_of_operator(char c) {
    const bool rejected = c == '+' || c == '-' || c == '~' || c == '!' || c == '$';
    return !rejected;
}
184 | | |
185 | | static const std::map<std::string, Token::Kind> keywords = { |
186 | | {"assert", Token::ASSERT}, |
187 | | {"else", Token::ELSE}, |
188 | | {"error", Token::ERROR}, |
189 | | {"false", Token::FALSE}, |
190 | | {"for", Token::FOR}, |
191 | | {"function", Token::FUNCTION}, |
192 | | {"if", Token::IF}, |
193 | | {"import", Token::IMPORT}, |
194 | | {"importstr", Token::IMPORTSTR}, |
195 | | {"importbin", Token::IMPORTBIN}, |
196 | | {"in", Token::IN}, |
197 | | {"local", Token::LOCAL}, |
198 | | {"null", Token::NULL_LIT}, |
199 | | {"self", Token::SELF}, |
200 | | {"super", Token::SUPER}, |
201 | | {"tailstrict", Token::TAILSTRICT}, |
202 | | {"then", Token::THEN}, |
203 | | {"true", Token::TRUE}, |
204 | | }; |
205 | | |
206 | | Token::Kind lex_get_keyword_kind(const std::string &identifier) |
207 | 128M | { |
208 | 128M | auto it = keywords.find(identifier); |
209 | 128M | if (it == keywords.end()) |
210 | 93.9M | return Token::IDENTIFIER; |
211 | 34.3M | return it->second; |
212 | 128M | } |
213 | | |
214 | | std::string lex_number(const char *&c, const std::string &filename, const Location &begin) |
215 | 13.4M | { |
216 | | // This function should be understood with reference to the linked image: |
217 | | // https://www.json.org/img/number.png |
218 | | |
219 | | // Note, we deviate from the json.org documentation as follows: |
220 | | // * There is no reason to lex negative numbers as atomic tokens, it is better to parse them |
221 | | // as a unary operator combined with a numeric literal. This avoids x-1 being tokenized as |
222 | | // <identifier> <number> instead of the intended <identifier> <binop> <number>. |
223 | | // * We support digit separators using the _ character for readability in |
224 | | // large numeric literals. |
225 | | |
226 | 13.4M | enum State { |
227 | 13.4M | BEGIN, |
228 | 13.4M | AFTER_ZERO, |
229 | 13.4M | AFTER_ONE_TO_NINE, |
230 | 13.4M | AFTER_INT_UNDERSCORE, |
231 | 13.4M | AFTER_DOT, |
232 | 13.4M | AFTER_DIGIT, |
233 | 13.4M | AFTER_FRAC_UNDERSCORE, |
234 | 13.4M | AFTER_E, |
235 | 13.4M | AFTER_EXP_SIGN, |
236 | 13.4M | AFTER_EXP_DIGIT, |
237 | 13.4M | AFTER_EXP_UNDERSCORE |
238 | 13.4M | } state; |
239 | | |
240 | 13.4M | std::string r; |
241 | | |
242 | 13.4M | state = BEGIN; |
243 | 30.7M | while (true) { |
244 | 30.7M | switch (state) { |
245 | 13.4M | case BEGIN: |
246 | 13.4M | switch (*c) { |
247 | 4.09M | case '0': state = AFTER_ZERO; break; |
248 | | |
249 | 3.93M | case '1': |
250 | 4.72M | case '2': |
251 | 5.36M | case '3': |
252 | 6.15M | case '4': |
253 | 6.31M | case '5': |
254 | 6.80M | case '6': |
255 | 7.11M | case '7': |
256 | 8.39M | case '8': |
257 | 9.34M | case '9': state = AFTER_ONE_TO_NINE; break; |
258 | | |
259 | 0 | default: throw StaticError(filename, begin, "couldn't lex number"); |
260 | 13.4M | } |
261 | 13.4M | break; |
262 | | |
263 | 13.4M | case AFTER_ZERO: |
264 | 4.09M | switch (*c) { |
265 | 44.9k | case '.': state = AFTER_DOT; break; |
266 | | |
267 | 1.34k | case 'e': |
268 | 2.48k | case 'E': state = AFTER_E; break; |
269 | | |
270 | 5 | case '_': { |
271 | 5 | std::stringstream ss; |
272 | 5 | ss << "couldn't lex number, _ not allowed after leading 0"; |
273 | 5 | throw StaticError(filename, begin, ss.str()); |
274 | 1.34k | } |
275 | | |
276 | 4.04M | default: goto end; |
277 | 4.09M | } |
278 | 47.3k | break; |
279 | | |
280 | 12.3M | case AFTER_ONE_TO_NINE: |
281 | 12.3M | switch (*c) { |
282 | 27.5k | case '.': state = AFTER_DOT; break; |
283 | | |
284 | 2.66k | case 'e': |
285 | 5.09k | case 'E': state = AFTER_E; break; |
286 | | |
287 | 1.23M | case '0': |
288 | 1.27M | case '1': |
289 | 1.52M | case '2': |
290 | 1.61M | case '3': |
291 | 1.72M | case '4': |
292 | 1.94M | case '5': |
293 | 2.08M | case '6': |
294 | 2.15M | case '7': |
295 | 2.22M | case '8': |
296 | 3.01M | case '9': state = AFTER_ONE_TO_NINE; break; |
297 | | |
298 | 749 | case '_': state = AFTER_INT_UNDERSCORE; goto skip_char; |
299 | | |
300 | 9.30M | default: goto end; |
301 | 12.3M | } |
302 | 3.04M | break; |
303 | | |
304 | 3.04M | case AFTER_INT_UNDERSCORE: |
305 | 749 | switch (*c) { |
306 | | // The only valid transition from _ is to a digit. |
307 | 386 | case '0': |
308 | 454 | case '1': |
309 | 478 | case '2': |
310 | 481 | case '3': |
311 | 489 | case '4': |
312 | 639 | case '5': |
313 | 640 | case '6': |
314 | 640 | case '7': |
315 | 729 | case '8': |
316 | 729 | case '9': state = AFTER_ONE_TO_NINE; break; |
317 | | |
318 | 20 | default: { |
319 | 20 | std::stringstream ss; |
320 | 20 | ss << "couldn't lex number, junk after _: " << *c; |
321 | 20 | throw StaticError(filename, begin, ss.str()); |
322 | 729 | } |
323 | 749 | } |
324 | 729 | break; |
325 | | |
326 | 72.4k | case AFTER_DOT: |
327 | 72.4k | switch (*c) { |
328 | 2.26k | case '0': |
329 | 25.1k | case '1': |
330 | 26.0k | case '2': |
331 | 27.0k | case '3': |
332 | 27.4k | case '4': |
333 | 70.3k | case '5': |
334 | 70.9k | case '6': |
335 | 71.0k | case '7': |
336 | 71.7k | case '8': |
337 | 72.4k | case '9': state = AFTER_DIGIT; break; |
338 | | |
339 | 23 | default: { |
340 | 23 | std::stringstream ss; |
341 | 23 | ss << "couldn't lex number, junk after decimal point: " << *c; |
342 | 23 | throw StaticError(filename, begin, ss.str()); |
343 | 71.7k | } |
344 | 72.4k | } |
345 | 72.4k | break; |
346 | | |
347 | 581k | case AFTER_DIGIT: |
348 | 581k | switch (*c) { |
349 | 996 | case 'e': |
350 | 2.36k | case 'E': state = AFTER_E; break; |
351 | | |
352 | 106k | case '0': |
353 | 177k | case '1': |
354 | 202k | case '2': |
355 | 249k | case '3': |
356 | 274k | case '4': |
357 | 343k | case '5': |
358 | 389k | case '6': |
359 | 414k | case '7': |
360 | 441k | case '8': |
361 | 508k | case '9': state = AFTER_DIGIT; break; |
362 | | |
363 | 744 | case '_': state = AFTER_FRAC_UNDERSCORE; goto skip_char; |
364 | | |
365 | 70.0k | default: goto end; |
366 | 581k | } |
367 | 511k | break; |
368 | | |
369 | 511k | case AFTER_FRAC_UNDERSCORE: |
370 | 744 | switch (*c) { |
371 | | // The only valid transition from _ is to a digit. |
372 | 194 | case '0': |
373 | 231 | case '1': |
374 | 550 | case '2': |
375 | 552 | case '3': |
376 | 558 | case '4': |
377 | 600 | case '5': |
378 | 601 | case '6': |
379 | 601 | case '7': |
380 | 729 | case '8': |
381 | 729 | case '9': state = AFTER_DIGIT; break; |
382 | | |
383 | 15 | default: { |
384 | 15 | std::stringstream ss; |
385 | 15 | ss << "couldn't lex number, junk after _: " << *c; |
386 | 15 | throw StaticError(filename, begin, ss.str()); |
387 | 729 | } |
388 | 744 | } |
389 | 729 | break; |
390 | | |
391 | 9.94k | case AFTER_E: |
392 | 9.94k | switch (*c) { |
393 | 1.61k | case '+': |
394 | 3.57k | case '-': state = AFTER_EXP_SIGN; break; |
395 | | |
396 | 1.72k | case '0': |
397 | 2.63k | case '1': |
398 | 3.96k | case '2': |
399 | 4.37k | case '3': |
400 | 4.47k | case '4': |
401 | 5.02k | case '5': |
402 | 5.56k | case '6': |
403 | 5.64k | case '7': |
404 | 5.82k | case '8': |
405 | 6.30k | case '9': state = AFTER_EXP_DIGIT; break; |
406 | | |
407 | 69 | default: { |
408 | 69 | std::stringstream ss; |
409 | 69 | ss << "couldn't lex number, junk after 'E': " << *c; |
410 | 69 | throw StaticError(filename, begin, ss.str()); |
411 | 5.82k | } |
412 | 9.94k | } |
413 | 9.88k | break; |
414 | | |
415 | 9.88k | case AFTER_EXP_SIGN: |
416 | 3.57k | switch (*c) { |
417 | 991 | case '0': |
418 | 1.42k | case '1': |
419 | 1.74k | case '2': |
420 | 2.93k | case '3': |
421 | 3.27k | case '4': |
422 | 3.28k | case '5': |
423 | 3.33k | case '6': |
424 | 3.37k | case '7': |
425 | 3.47k | case '8': |
426 | 3.55k | case '9': state = AFTER_EXP_DIGIT; break; |
427 | | |
428 | 16 | default: { |
429 | 16 | std::stringstream ss; |
430 | 16 | ss << "couldn't lex number, junk after exponent sign: " << *c; |
431 | 16 | throw StaticError(filename, begin, ss.str()); |
432 | 3.47k | } |
433 | 3.57k | } |
434 | 3.55k | break; |
435 | | |
436 | 209k | case AFTER_EXP_DIGIT: |
437 | 209k | switch (*c) { |
438 | 166k | case '0': |
439 | 171k | case '1': |
440 | 174k | case '2': |
441 | 177k | case '3': |
442 | 182k | case '4': |
443 | 184k | case '5': |
444 | 187k | case '6': |
445 | 191k | case '7': |
446 | 196k | case '8': |
447 | 199k | case '9': state = AFTER_EXP_DIGIT; break; |
448 | | |
449 | 587 | case '_': state = AFTER_EXP_UNDERSCORE; goto skip_char; |
450 | | |
451 | 9.85k | default: goto end; |
452 | 209k | } |
453 | 199k | break; |
454 | | |
455 | 199k | case AFTER_EXP_UNDERSCORE: |
456 | 587 | switch (*c) { |
457 | | // The only valid transition from _ is to a digit. |
458 | 131 | case '0': |
459 | 472 | case '1': |
460 | 476 | case '2': |
461 | 476 | case '3': |
462 | 484 | case '4': |
463 | 488 | case '5': |
464 | 501 | case '6': |
465 | 506 | case '7': |
466 | 562 | case '8': |
467 | 575 | case '9': state = AFTER_EXP_DIGIT; break; |
468 | | |
469 | 12 | default: { |
470 | 12 | std::stringstream ss; |
471 | 12 | ss << "couldn't lex number, junk after _: " << *c; |
472 | 12 | throw StaticError(filename, begin, ss.str()); |
473 | 562 | } |
474 | 587 | } |
475 | 575 | break; |
476 | 30.7M | } |
477 | 17.3M | r += *c; |
478 | | |
479 | 17.3M | skip_char: |
480 | 17.3M | c++; |
481 | 17.3M | } |
482 | 13.4M | end: |
483 | 13.4M | return r; |
484 | 13.4M | } |
485 | | |
// Measure the leading run of spaces/tabs of a and verify that b starts with exactly the same
// chars. Returns the length of that run if b matches it, otherwise 0. If a has no whitespace
// prefix then the result is 0 as well.
static int whitespace_check(const char *a, const char *b)
{
    int n = 0;
    for (; a[n] == ' ' || a[n] == '\t'; ++n) {
        if (a[n] != b[n])
            return 0;
    }
    return n;
}
498 | | |
/** Append a human readable description of an indentation string to `msg`.
 *
 * Spaces and tabs in `ws` are counted (other chars are ignored) and reported as e.g.
 * "2 spaces and 1 tab"; if neither occurs, "no indentation" is written.
 */
static void describe_whitespace(std::stringstream& msg, const std::string& ws) {
    int n_spaces = 0;
    int n_tabs = 0;
    for (size_t k = 0; k < ws.size(); ++k) {
        switch (ws[k]) {
            case ' ': ++n_spaces; break;
            case '\t': ++n_tabs; break;
            default: break;
        }
    }

    if (n_spaces == 0 && n_tabs == 0) {
        msg << "no indentation";
        return;
    }
    if (n_spaces > 0)
        msg << n_spaces << (n_spaces == 1 ? " space" : " spaces");
    if (n_spaces > 0 && n_tabs > 0)
        msg << " and ";
    if (n_tabs > 0)
        msg << n_tabs << (n_tabs == 1 ? " tab" : " tabs");
}
519 | | |
520 | | Tokens jsonnet_lex(const std::string &filename, const char *input) |
521 | 42.5k | { |
522 | 42.5k | unsigned long line_number = 1; |
523 | 42.5k | const char *line_start = input; |
524 | | |
525 | 42.5k | Tokens r; |
526 | | |
527 | 42.5k | const char *c = input; |
528 | | |
529 | 42.5k | Fodder fodder; |
530 | 42.5k | bool fresh_line = true; // Are we tokenizing from the beginning of a new line? |
531 | | |
532 | 304M | while (*c != '\0') { |
533 | | // Used to ensure we have actually advanced the pointer by the end of the iteration. |
534 | 304M | const char *original_c = c; |
535 | | |
536 | 304M | Token::Kind kind; |
537 | 304M | std::string data; |
538 | 304M | std::string string_block_indent; |
539 | 304M | std::string string_block_term_indent; |
540 | | |
541 | 304M | unsigned new_lines, indent; |
542 | 304M | lex_ws(c, new_lines, indent, line_start, line_number); |
543 | | |
544 | | // If it's the end of the file, discard final whitespace. |
545 | 304M | if (*c == '\0') |
546 | 21.9k | break; |
547 | | |
548 | 304M | if (new_lines > 0) { |
549 | | // Otherwise store whitespace in fodder. |
550 | 32.1M | unsigned blanks = new_lines - 1; |
551 | 32.1M | fodder.emplace_back(FodderElement::LINE_END, blanks, indent, EMPTY); |
552 | 32.1M | fresh_line = true; |
553 | 32.1M | } |
554 | | |
555 | 304M | Location begin(line_number, c - line_start + 1); |
556 | | |
557 | 304M | switch (*c) { |
558 | | // The following operators should never be combined with subsequent symbols. |
559 | 1.31M | case '{': |
560 | 1.31M | kind = Token::BRACE_L; |
561 | 1.31M | c++; |
562 | 1.31M | break; |
563 | | |
564 | 1.29M | case '}': |
565 | 1.29M | kind = Token::BRACE_R; |
566 | 1.29M | c++; |
567 | 1.29M | break; |
568 | | |
569 | 6.81M | case '[': |
570 | 6.81M | kind = Token::BRACKET_L; |
571 | 6.81M | c++; |
572 | 6.81M | break; |
573 | | |
574 | 6.77M | case ']': |
575 | 6.77M | kind = Token::BRACKET_R; |
576 | 6.77M | c++; |
577 | 6.77M | break; |
578 | | |
579 | 26.0M | case ',': |
580 | 26.0M | kind = Token::COMMA; |
581 | 26.0M | c++; |
582 | 26.0M | break; |
583 | | |
584 | 15.3M | case '.': |
585 | 15.3M | kind = Token::DOT; |
586 | 15.3M | c++; |
587 | 15.3M | break; |
588 | | |
589 | 23.1M | case '(': |
590 | 23.1M | kind = Token::PAREN_L; |
591 | 23.1M | c++; |
592 | 23.1M | break; |
593 | | |
594 | 23.1M | case ')': |
595 | 23.1M | kind = Token::PAREN_R; |
596 | 23.1M | c++; |
597 | 23.1M | break; |
598 | | |
599 | 6.06M | case ';': |
600 | 6.06M | kind = Token::SEMICOLON; |
601 | 6.06M | c++; |
602 | 6.06M | break; |
603 | | |
604 | | // Numeric literals. |
605 | 4.09M | case '0': |
606 | 8.02M | case '1': |
607 | 8.81M | case '2': |
608 | 9.46M | case '3': |
609 | 10.2M | case '4': |
610 | 10.4M | case '5': |
611 | 10.9M | case '6': |
612 | 11.2M | case '7': |
613 | 12.4M | case '8': |
614 | 13.4M | case '9': |
615 | 13.4M | kind = Token::NUMBER; |
616 | 13.4M | data = lex_number(c, filename, begin); |
617 | 13.4M | break; |
618 | | |
619 | | // UString literals. |
620 | 361k | case '"': { |
621 | 361k | c++; |
622 | 70.4M | for (;; ++c) { |
623 | 70.4M | if (*c == '\0') { |
624 | 68 | throw StaticError(filename, begin, "unterminated string"); |
625 | 68 | } |
626 | 70.4M | if (*c == '"') { |
627 | 361k | break; |
628 | 361k | } |
629 | 70.0M | if (*c == '\\' && *(c + 1) != '\0') { |
630 | 217k | data += *c; |
631 | 217k | ++c; |
632 | 217k | } |
633 | 70.0M | if (*c == '\n') { |
634 | | // Maintain line/column counters. |
635 | 5.62M | line_number++; |
636 | 5.62M | line_start = c + 1; |
637 | 5.62M | } |
638 | 70.0M | data += *c; |
639 | 70.0M | } |
640 | 361k | c++; // Advance beyond the ". |
641 | 361k | kind = Token::STRING_DOUBLE; |
642 | 361k | } break; |
643 | | |
644 | | // UString literals. |
645 | 10.0M | case '\'': { |
646 | 10.0M | c++; |
647 | 139M | for (;; ++c) { |
648 | 139M | if (*c == '\0') { |
649 | 71 | throw StaticError(filename, begin, "unterminated string"); |
650 | 71 | } |
651 | 139M | if (*c == '\'') { |
652 | 10.0M | break; |
653 | 10.0M | } |
654 | 129M | if (*c == '\\' && *(c + 1) != '\0') { |
655 | 1.11M | data += *c; |
656 | 1.11M | ++c; |
657 | 1.11M | } |
658 | 129M | if (*c == '\n') { |
659 | | // Maintain line/column counters. |
660 | 4.46M | line_number++; |
661 | 4.46M | line_start = c + 1; |
662 | 4.46M | } |
663 | 129M | data += *c; |
664 | 129M | } |
665 | 10.0M | c++; // Advance beyond the '. |
666 | 10.0M | kind = Token::STRING_SINGLE; |
667 | 10.0M | } break; |
668 | | |
669 | | // Verbatim string literals. |
670 | | // ' and " quoting is interpreted here, unlike non-verbatim strings |
671 | | // where it is done later by jsonnet_string_unescape. This is OK |
672 | | // in this case because no information is lost by resoving the |
673 | | // repeated quote into a single quote, so we can go back to the |
674 | | // original form in the formatter. |
675 | 11.2k | case '@': { |
676 | 11.2k | c++; |
677 | 11.2k | if (*c != '"' && *c != '\'') { |
678 | 43 | std::stringstream ss; |
679 | 43 | ss << "couldn't lex verbatim string, junk after '@': " << *c; |
680 | 43 | throw StaticError(filename, begin, ss.str()); |
681 | 43 | } |
682 | 11.1k | const char quot = *c; |
683 | 11.1k | c++; // Advance beyond the opening quote. |
684 | 210k | for (;; ++c) { |
685 | 210k | if (*c == '\0') { |
686 | 74 | throw StaticError(filename, begin, "unterminated verbatim string"); |
687 | 74 | } |
688 | 210k | if (*c == quot) { |
689 | 14.0k | if (*(c + 1) == quot) { |
690 | 2.95k | c++; |
691 | 11.1k | } else { |
692 | 11.1k | break; |
693 | 11.1k | } |
694 | 14.0k | } |
695 | 199k | data += *c; |
696 | 199k | } |
697 | 11.1k | c++; // Advance beyond the closing quote. |
698 | 11.1k | if (quot == '"') { |
699 | 7.32k | kind = Token::VERBATIM_STRING_DOUBLE; |
700 | 7.32k | } else { |
701 | 3.77k | kind = Token::VERBATIM_STRING_SINGLE; |
702 | 3.77k | } |
703 | 11.1k | } break; |
704 | | |
705 | | // Keywords |
706 | 170M | default: |
707 | 170M | if (is_identifier_first(*c)) { |
708 | 128M | std::string id; |
709 | 694M | for (; is_identifier(*c); ++c) |
710 | 566M | id += *c; |
711 | 128M | kind = lex_get_keyword_kind(id); |
712 | 128M | data = id; |
713 | | |
714 | 128M | } else if (is_symbol(*c) || *c == '#') { |
715 | | // Single line C++ and Python style comments. |
716 | 41.8M | if (*c == '#' || (*c == '/' && *(c + 1) == '/')) { |
717 | 8.23M | std::vector<std::string> comment(1); |
718 | 8.23M | unsigned blanks; |
719 | 8.23M | unsigned indent; |
720 | 8.23M | lex_until_newline(c, comment[0], blanks, indent, line_start, line_number); |
721 | 8.23M | auto kind = fresh_line ? FodderElement::PARAGRAPH : FodderElement::LINE_END; |
722 | 8.23M | fodder.emplace_back(kind, blanks, indent, comment); |
723 | 8.23M | fresh_line = true; |
724 | 8.23M | continue; // We've not got a token, just fodder, so keep scanning. |
725 | 8.23M | } |
726 | | |
727 | | // Multi-line C style comment. |
728 | 33.6M | if (*c == '/' && *(c + 1) == '*') { |
729 | 855k | unsigned margin = c - line_start; |
730 | | |
731 | 855k | const char *initial_c = c; |
732 | 855k | c += 2; // Avoid matching /*/: skip the /* before starting the search for |
733 | | // */. |
734 | | |
735 | 128M | while (!(*c == '*' && *(c + 1) == '/')) { |
736 | 127M | if (*c == '\0') { |
737 | 189 | auto msg = "multi-line comment has no terminating */."; |
738 | 189 | throw StaticError(filename, begin, msg); |
739 | 189 | } |
740 | 127M | if (*c == '\n') { |
741 | | // Just keep track of the line / column counters. |
742 | 19.2M | line_number++; |
743 | 19.2M | line_start = c + 1; |
744 | 19.2M | } |
745 | 127M | ++c; |
746 | 127M | } |
747 | 855k | c += 2; // Move the pointer to the char after the closing '/'. |
748 | | |
749 | 855k | std::string comment(initial_c, |
750 | 855k | c - initial_c); // Includes the "/*" and "*/". |
751 | | |
752 | | // Lex whitespace after comment |
753 | 855k | unsigned new_lines_after, indent_after; |
754 | 855k | lex_ws(c, new_lines_after, indent_after, line_start, line_number); |
755 | 855k | std::vector<std::string> lines; |
756 | 855k | if (comment.find('\n') >= comment.length()) { |
757 | | // Comment looks like /* foo */ |
758 | 535k | lines.push_back(comment); |
759 | 535k | fodder.emplace_back(FodderElement::INTERSTITIAL, 0, 0, lines); |
760 | 535k | if (new_lines_after > 0) { |
761 | 521k | fodder.emplace_back(FodderElement::LINE_END, |
762 | 521k | new_lines_after - 1, |
763 | 521k | indent_after, |
764 | 521k | EMPTY); |
765 | 521k | fresh_line = true; |
766 | 521k | } |
767 | 535k | } else { |
768 | 320k | lines = line_split(comment, margin); |
769 | 320k | assert(lines[0][0] == '/'); |
770 | | // Little hack to support PARAGRAPHs with * down the LHS: |
771 | | // Add a space to lines that start with a '*' |
772 | 320k | bool all_star = true; |
773 | 19.5M | for (auto &l : lines) { |
774 | 19.5M | if (l[0] != '*') |
775 | 19.3M | all_star = false; |
776 | 19.5M | } |
777 | 320k | if (all_star) { |
778 | 0 | for (auto &l : lines) { |
779 | 0 | if (l[0] == '*') |
780 | 0 | l = " " + l; |
781 | 0 | } |
782 | 0 | } |
783 | 320k | if (new_lines_after == 0) { |
784 | | // Ensure a line end after the paragraph. |
785 | 27.5k | new_lines_after = 1; |
786 | 27.5k | indent_after = 0; |
787 | 27.5k | } |
788 | 320k | fodder_push_back(fodder, |
789 | 320k | FodderElement(FodderElement::PARAGRAPH, |
790 | 320k | new_lines_after - 1, |
791 | 320k | indent_after, |
792 | 320k | lines)); |
793 | 320k | fresh_line = true; |
794 | 320k | } |
795 | 855k | continue; // We've not got a token, just fodder, so keep scanning. |
796 | 855k | } |
797 | | |
798 | | // Text block |
799 | 32.7M | if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|') { |
800 | 16.2k | c += 3; // Skip the "|||". |
801 | | |
802 | 16.2k | bool chomp_trailing_nl = false; |
803 | 16.2k | if (*c == '-') { |
804 | 1.70k | chomp_trailing_nl = true; |
805 | 1.70k | c++; |
806 | 1.70k | } |
807 | | |
808 | 20.5k | while (is_horz_ws(*c)) ++c; // Chomp whitespace at end of line. |
809 | 16.2k | if (*c != '\n') { |
810 | 117 | auto msg = "text block syntax requires new line after |||."; |
811 | 117 | throw StaticError(filename, begin, msg); |
812 | 117 | } |
813 | 16.0k | std::stringstream block; |
814 | 16.0k | c++; // Skip the "\n" |
815 | 16.0k | line_number++; |
816 | | // Skip any blank lines at the beginning of the block. |
817 | 19.9k | while (*c == '\n') { |
818 | 3.84k | line_number++; |
819 | 3.84k | ++c; |
820 | 3.84k | block << '\n'; |
821 | 3.84k | } |
822 | 16.0k | line_start = c; |
823 | 16.0k | const char *first_line = c; |
824 | 16.0k | int ws_chars = whitespace_check(first_line, c); |
825 | 16.0k | string_block_indent = std::string(first_line, ws_chars); |
826 | 16.0k | if (ws_chars == 0) { |
827 | 64 | auto msg = "text block's first line must start with whitespace."; |
828 | 64 | throw StaticError(filename, begin, msg); |
829 | 64 | } |
830 | 21.8k | while (true) { |
831 | 21.8k | assert(ws_chars > 0); |
832 | | // Read up to the \n |
833 | 9.70M | for (c = &c[ws_chars]; *c != '\n'; ++c) { |
834 | 9.68M | if (*c == '\0') |
835 | 184 | throw StaticError(filename, begin, "unexpected EOF"); |
836 | 9.68M | block << *c; |
837 | 9.68M | } |
838 | | // Add the \n |
839 | 21.6k | block << '\n'; |
840 | 21.6k | ++c; |
841 | 21.6k | line_number++; |
842 | 21.6k | line_start = c; |
843 | | // Skip any blank lines |
844 | 25.1k | while (*c == '\n') { |
845 | 3.43k | line_number++; |
846 | 3.43k | ++c; |
847 | 3.43k | block << '\n'; |
848 | 3.43k | } |
849 | | // Examine next line |
850 | 21.6k | ws_chars = whitespace_check(first_line, c); |
851 | 21.6k | if (ws_chars == 0) { |
852 | | // End of text block (or indentation error). |
853 | | // Count actual whitespace on this line. |
854 | 15.8k | int actual_ws = 0; |
855 | 215k | while (c[actual_ws] == ' ' || |
856 | 199k | c[actual_ws] == '\t') { |
857 | 199k | actual_ws++; |
858 | 199k | } |
859 | | |
860 | | // Check if this is the terminator ||| |
861 | 15.8k | bool is_terminator = ( |
862 | 15.8k | c[actual_ws] == '|' && |
863 | 15.6k | c[actual_ws + 1] == '|' && |
864 | 15.6k | c[actual_ws + 2] == '|'); |
865 | | |
866 | 15.8k | if (!is_terminator) { |
867 | | // Not a terminator - check if it's an |
868 | | // indentation issue. |
869 | 244 | if (actual_ws > 0) { |
870 | | // Has whitespace but doesn't match expected |
871 | | // indentation. |
872 | 115 | std::stringstream msg; |
873 | 115 | msg << "text block indentation mismatch: " |
874 | 115 | "expected at least "; |
875 | 115 | describe_whitespace(msg, string_block_indent); |
876 | 115 | msg << ", found "; |
877 | 115 | describe_whitespace(msg, std::string(c, actual_ws)); |
878 | 115 | throw StaticError(filename, begin, msg.str()); |
879 | 129 | } else { |
880 | | // No whitespace and no ||| - missing |
881 | | // terminator. |
882 | 129 | auto msg = |
883 | 129 | "text block not terminated with |||"; |
884 | 129 | throw StaticError(filename, begin, msg); |
885 | 129 | } |
886 | 244 | } |
887 | | |
888 | | // Valid termination - skip over any whitespace. |
889 | 107k | while (*c == ' ' || *c == '\t') { |
890 | 92.0k | string_block_term_indent += *c; |
891 | 92.0k | ++c; |
892 | 92.0k | } |
893 | | // Skip the ||| |
894 | 15.6k | c += 3; // Leave after the last | |
895 | 15.6k | data = block.str(); |
896 | 15.6k | kind = Token::STRING_BLOCK; |
897 | 15.6k | if (chomp_trailing_nl) { |
898 | 1.66k | assert(data.back() == '\n'); |
899 | 1.66k | data.pop_back(); |
900 | 1.66k | } |
901 | 15.6k | break; // Out of the while loop. |
902 | 15.6k | } |
903 | 21.6k | } |
904 | | |
905 | 15.6k | break; // Out of the switch. |
906 | 16.0k | } |
907 | | |
908 | 32.7M | const char *operator_begin = c; |
909 | 103M | for (; is_symbol(*c); ++c) { |
910 | | // Not allowed // in operators |
911 | 71.1M | if (*c == '/' && *(c + 1) == '/') |
912 | 1.30k | break; |
913 | | // Not allowed /* in operators |
914 | 71.1M | if (*c == '/' && *(c + 1) == '*') |
915 | 1.45k | break; |
916 | | // Not allowed ||| in operators |
917 | 71.1M | if (*c == '|' && *(c + 1) == '|' && *(c + 2) == '|') |
918 | 2.26k | break; |
919 | 71.1M | } |
920 | | // Not allowed to end with a + - ~ ! unless a single char. |
921 | | // So, wind it back if we need to (but not too far). |
922 | 38.4M | while (c > operator_begin + 1 && !allowed_at_end_of_operator(*(c - 1))) { |
923 | 5.75M | c--; |
924 | 5.75M | } |
925 | 32.7M | data += std::string(operator_begin, c); |
926 | 32.7M | if (data == "$") { |
927 | 114k | kind = Token::DOLLAR; |
928 | 114k | data = ""; |
929 | 32.6M | } else { |
930 | 32.6M | kind = Token::OPERATOR; |
931 | 32.6M | } |
932 | 32.7M | } else { |
933 | 321 | std::stringstream ss; |
934 | 321 | ss << "Could not lex the character "; |
935 | 321 | auto uc = (unsigned char)(*c); |
936 | 321 | if (*c < 32) |
937 | 289 | ss << "code " << unsigned(uc); |
938 | 32 | else |
939 | 32 | ss << "'" << *c << "'"; |
940 | 321 | throw StaticError(filename, begin, ss.str()); |
941 | 321 | } |
942 | 304M | } |
943 | | |
944 | | // Ensure that a bug in the above code does not cause an infinite memory consuming loop due |
945 | | // to pushing empty tokens. |
946 | 294M | if (c == original_c) { |
947 | 0 | throw StaticError(filename, begin, "internal lexing error: pointer did not advance"); |
948 | 0 | } |
949 | | |
950 | 294M | Location end(line_number, (c + 1) - line_start); |
951 | 294M | r.emplace_back(kind, |
952 | 294M | fodder, |
953 | 294M | data, |
954 | 294M | string_block_indent, |
955 | 294M | string_block_term_indent, |
956 | 294M | LocationRange(filename, begin, end)); |
957 | 294M | fodder.clear(); |
958 | 294M | fresh_line = false; |
959 | 294M | } |
960 | | |
961 | 41.0k | Location begin(line_number, c - line_start + 1); |
962 | 41.0k | Location end(line_number, (c + 1) - line_start + 1); |
963 | 41.0k | r.emplace_back(Token::END_OF_FILE, fodder, "", "", "", LocationRange(filename, begin, end)); |
964 | 41.0k | return r; |
965 | 42.5k | } |
966 | | |
967 | | std::string jsonnet_unlex(const Tokens &tokens) |
968 | 0 | { |
969 | 0 | std::stringstream ss; |
970 | 0 | for (const auto &t : tokens) { |
971 | 0 | for (const auto &f : t.fodder) { |
972 | 0 | switch (f.kind) { |
973 | 0 | case FodderElement::LINE_END: { |
974 | 0 | if (f.comment.size() > 0) { |
975 | 0 | ss << "LineEnd(" << f.blanks << ", " << f.indent << ", " << f.comment[0] |
976 | 0 | << ")\n"; |
977 | 0 | } else { |
978 | 0 | ss << "LineEnd(" << f.blanks << ", " << f.indent << ")\n"; |
979 | 0 | } |
980 | 0 | } break; |
981 | | |
982 | 0 | case FodderElement::INTERSTITIAL: { |
983 | 0 | ss << "Interstitial(" << f.comment[0] << ")\n"; |
984 | 0 | } break; |
985 | | |
986 | 0 | case FodderElement::PARAGRAPH: { |
987 | 0 | ss << "Paragraph(\n"; |
988 | 0 | for (const auto &line : f.comment) { |
989 | 0 | ss << " " << line << '\n'; |
990 | 0 | } |
991 | 0 | ss << ")" << f.blanks << "\n"; |
992 | 0 | } break; |
993 | 0 | } |
994 | 0 | } |
995 | 0 | if (t.kind == Token::END_OF_FILE) { |
996 | 0 | ss << "EOF\n"; |
997 | 0 | break; |
998 | 0 | } |
999 | 0 | if (t.kind == Token::STRING_DOUBLE) { |
1000 | 0 | ss << "\"" << t.data << "\"\n"; |
1001 | 0 | } else if (t.kind == Token::STRING_SINGLE) { |
1002 | 0 | ss << "'" << t.data << "'\n"; |
1003 | 0 | } else if (t.kind == Token::STRING_BLOCK) { |
1004 | 0 | ss << "|||\n"; |
1005 | 0 | ss << t.stringBlockIndent; |
1006 | 0 | for (const char *cp = t.data.c_str(); *cp != '\0'; ++cp) { |
1007 | 0 | ss << *cp; |
1008 | 0 | if (*cp == '\n' && *(cp + 1) != '\n' && *(cp + 1) != '\0') { |
1009 | 0 | ss << t.stringBlockIndent; |
1010 | 0 | } |
1011 | 0 | } |
1012 | 0 | ss << t.stringBlockTermIndent << "|||\n"; |
1013 | 0 | } else { |
1014 | 0 | ss << t.data << "\n"; |
1015 | 0 | } |
1016 | 0 | } |
1017 | 0 | return ss.str(); |
1018 | 0 | } |
1019 | | |
1020 | | } // namespace jsonnet::internal |