/src/cpython/Parser/lexer/lexer.c
Line | Count | Source |
1 | | #include "Python.h" |
2 | | #include "pycore_token.h" |
3 | | #include "pycore_unicodeobject.h" |
4 | | #include "errcode.h" |
5 | | |
6 | | #include "state.h" |
7 | | #include "../tokenizer/helpers.h" |
8 | | |
9 | | /* Alternate tab spacing */ |
10 | 990 | #define ALTTABSIZE 1 |
11 | | |
12 | 1.59M | #define is_potential_identifier_start(c) (\ |
13 | 1.59M | (c >= 'a' && c <= 'z')\ |
14 | 1.59M | || (c >= 'A' && c <= 'Z')\ |
15 | 1.59M | || c == '_'\ |
16 | 1.59M | || (c >= 128)) |
17 | | |
18 | 2.35M | #define is_potential_identifier_char(c) (\ |
19 | 2.35M | (c >= 'a' && c <= 'z')\ |
20 | 2.35M | || (c >= 'A' && c <= 'Z')\ |
21 | 2.35M | || (c >= '0' && c <= '9')\ |
22 | 2.35M | || c == '_'\ |
23 | 2.35M | || (c >= 128)) |
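
A minimal standalone sketch of the start-predicate above (the macro body is copied verbatim; the driver is illustrative only). Note that any byte >= 128 is provisionally accepted, deferring full PEP 3131 validation to verify_identifier() further down in this file:

    #include <stdio.h>

    #define is_potential_identifier_start(c) (\
        (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') \
        || c == '_' || (c >= 128))

    int main(void) {
        /* 0xC3 is the lead byte of UTF-8 'é': accepted here, checked later. */
        int probes[] = { 'a', 'Z', '_', '7', 0xC3 };
        for (unsigned i = 0; i < sizeof probes / sizeof probes[0]; i++) {
            printf("0x%02X -> %d\n", probes[i],
                   is_potential_identifier_start(probes[i]));
        }
        return 0;  /* the probes print 1, 1, 1, 0, 1 respectively */
    }
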
24 | | |
25 | | #ifdef Py_DEBUG |
26 | | static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) { |
27 | | assert(tok->tok_mode_stack_index >= 0); |
28 | | assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL); |
29 | | return &(tok->tok_mode_stack[tok->tok_mode_stack_index]); |
30 | | } |
31 | | static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) { |
32 | | assert(tok->tok_mode_stack_index >= 0); |
33 | | assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL); |
34 | | return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]); |
35 | | } |
36 | | #else |
37 | 1.71M | #define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index])) |
38 | 15.9k | #define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index])) |
39 | | #endif |
40 | | |
41 | | #define FTSTRING_MIDDLE(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_MIDDLE : FSTRING_MIDDLE) |
42 | | #define FTSTRING_END(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_END : FSTRING_END) |
43 | 40 | #define TOK_GET_STRING_PREFIX(tok) (TOK_GET_MODE(tok)->string_kind == TSTRING ? 't' : 'f') |
44 | 1.60M | #define MAKE_TOKEN(token_type) _PyLexer_token_setup(tok, token, token_type, p_start, p_end) |
45 | 0 | #define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\ |
46 | 0 | _PyLexer_type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) |
47 | | |
48 | | /* Spaces in this constant are treated as "zero or more spaces or tabs" when |
49 | | tokenizing. */ |
50 | | static const char* type_comment_prefix = "# type: "; |
51 | | |
52 | | static inline int |
53 | | contains_null_bytes(const char* str, size_t size) |
54 | 228k | { |
55 | 228k | return memchr(str, 0, size) != NULL; |
56 | 228k | } |
57 | | |
58 | | /* Get next char, updating state; error code goes into tok->done */ |
59 | | static int |
60 | | tok_nextc(struct tok_state *tok) |
61 | 10.3M | { |
62 | 10.3M | int rc; |
63 | 10.5M | for (;;) { |
64 | 10.5M | if (tok->cur != tok->inp) { |
65 | 10.2M | if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) { |
66 | 0 | tok->done = E_COLUMNOVERFLOW; |
67 | 0 | return EOF; |
68 | 0 | } |
69 | 10.2M | tok->col_offset++; |
70 | 10.2M | return Py_CHARMASK(*tok->cur++); /* Fast path */ |
71 | 10.2M | } |
72 | 276k | if (tok->done != E_OK) { |
73 | 32.0k | return EOF; |
74 | 32.0k | } |
75 | 244k | rc = tok->underflow(tok); |
76 | | #if defined(Py_DEBUG) |
77 | | if (tok->debug) { |
78 | | fprintf(stderr, "line[%d] = ", tok->lineno); |
79 | | _PyTokenizer_print_escape(stderr, tok->cur, tok->inp - tok->cur); |
80 | | fprintf(stderr, " tok->done = %d\n", tok->done); |
81 | | } |
82 | | #endif |
83 | 244k | if (!rc) { |
84 | 16.1k | tok->cur = tok->inp; |
85 | 16.1k | return EOF; |
86 | 16.1k | } |
87 | 228k | tok->line_start = tok->cur; |
88 | | |
89 | 228k | if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) { |
90 | 0 | _PyTokenizer_syntaxerror(tok, "source code cannot contain null bytes"); |
91 | 0 | tok->cur = tok->inp; |
92 | 0 | return EOF; |
93 | 0 | } |
94 | 228k | } |
95 | 10.3M | Py_UNREACHABLE(); |
96 | 10.3M | } |
97 | | |
98 | | /* Back-up one character */ |
99 | | static void |
100 | | tok_backup(struct tok_state *tok, int c) |
101 | 3.39M | { |
102 | 3.39M | if (c != EOF) { |
103 | 3.36M | if (--tok->cur < tok->buf) { |
104 | 0 | Py_FatalError("tokenizer beginning of buffer"); |
105 | 0 | } |
106 | 3.36M | if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) { |
107 | 0 | Py_FatalError("tok_backup: wrong character"); |
108 | 0 | } |
109 | 3.36M | tok->col_offset--; |
110 | 3.36M | } |
111 | 3.39M | } |
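
tok_backup() supports exactly one character of pushback per tok_nextc() call, and asserts that the character handed back is the one actually sitting in the buffer. A toy cursor demonstrating the same invariant (hypothetical struct, not the real tok_state):

    #include <assert.h>
    #include <stdio.h>

    struct cursor { const char *buf, *cur; };

    static int next_ch(struct cursor *c) { return (unsigned char)*c->cur++; }

    static void backup_ch(struct cursor *c, int ch) {
        assert(c->cur > c->buf);               /* "beginning of buffer" */
        c->cur--;
        assert((unsigned char)*c->cur == ch);  /* "wrong character" */
    }

    int main(void) {
        const char *src = "ab";
        struct cursor c = { src, src };
        int ch = next_ch(&c);    /* reads 'a' */
        backup_ch(&c, ch);       /* legal: restores the same character */
        printf("%c\n", *c.cur);  /* prints: a */
        return 0;
    }
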
112 | | |
113 | | static int |
114 | 23.4k | set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { |
115 | 23.4k | assert(token != NULL); |
116 | 23.4k | assert(c == '}' || c == ':' || c == '!'); |
117 | 23.4k | tokenizer_mode *tok_mode = TOK_GET_MODE(tok); |
118 | | |
119 | 23.4k | if (!(tok_mode->in_debug || tok_mode->string_kind == TSTRING) || token->metadata) { |
120 | 13.8k | return 0; |
121 | 13.8k | } |
122 | 9.65k | PyObject *res = NULL; |
123 | | |
124 | | // Look for a # character outside of string literals |
125 | 9.65k | int hash_detected = 0; |
126 | 9.65k | int in_string = 0; |
127 | 9.65k | char quote_char = 0; |
128 | | |
129 | 1.01M | for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) { |
130 | 1.01M | char ch = tok_mode->last_expr_buffer[i]; |
131 | | |
132 | | // Skip escaped characters |
133 | 1.01M | if (ch == '\\') { |
134 | 18.4k | i++; |
135 | 18.4k | continue; |
136 | 18.4k | } |
137 | | |
138 | | // Handle quotes |
139 | 992k | if (ch == '"' || ch == '\'') { |
140 | | // The following if/else block works because there is an odd number
141 | | // of quotes in STRING tokens and the lexer only ever reaches this |
142 | | // function with valid STRING tokens. |
143 | | // For example: """hello""" |
144 | | // First quote: in_string = 1 |
145 | | // Second quote: in_string = 0 |
146 | | // Third quote: in_string = 1 |
147 | 173k | if (!in_string) { |
148 | 64.3k | in_string = 1; |
149 | 64.3k | quote_char = ch; |
150 | 64.3k | } |
151 | 108k | else if (ch == quote_char) { |
152 | 63.6k | in_string = 0; |
153 | 63.6k | } |
154 | 173k | continue; |
155 | 173k | } |
156 | | |
157 | | // Check for # outside strings |
158 | 818k | if (ch == '#' && !in_string) { |
159 | 895 | hash_detected = 1; |
160 | 895 | break; |
161 | 895 | } |
162 | 818k | } |
163 | | // If we found a # character in the expression, we need to handle comments |
164 | 9.65k | if (hash_detected) { |
165 | | // Allocate buffer for processed result |
166 | 895 | char *result = (char *)PyMem_Malloc((tok_mode->last_expr_size - tok_mode->last_expr_end + 1) * sizeof(char)); |
167 | 895 | if (!result) { |
168 | 0 | return -1; |
169 | 0 | } |
170 | | |
171 | 895 | Py_ssize_t i = 0; // Input position |
172 | 895 | Py_ssize_t j = 0; // Output position |
173 | 895 | in_string = 0; // Whether we're in a string |
174 | 895 | quote_char = 0; // Current string quote char |
175 | | |
176 | | // Process each character |
177 | 63.5k | while (i < tok_mode->last_expr_size - tok_mode->last_expr_end) { |
178 | 62.6k | char ch = tok_mode->last_expr_buffer[i]; |
179 | | |
180 | | // Handle string quotes |
181 | 62.6k | if (ch == '"' || ch == '\'') { |
182 | | // See comment above to understand this part |
183 | 9.64k | if (!in_string) { |
184 | 3.82k | in_string = 1; |
185 | 3.82k | quote_char = ch; |
186 | 5.82k | } else if (ch == quote_char) { |
187 | 3.81k | in_string = 0; |
188 | 3.81k | } |
189 | 9.64k | result[j++] = ch; |
190 | 9.64k | } |
191 | | // Skip comments |
192 | 53.0k | else if (ch == '#' && !in_string) { |
193 | 46.8k | while (i < tok_mode->last_expr_size - tok_mode->last_expr_end && |
194 | 46.1k | tok_mode->last_expr_buffer[i] != '\n') { |
195 | 45.7k | i++; |
196 | 45.7k | } |
197 | 1.11k | if (i < tok_mode->last_expr_size - tok_mode->last_expr_end) { |
198 | 341 | result[j++] = '\n'; |
199 | 341 | } |
200 | 1.11k | } |
201 | | // Copy other chars |
202 | 51.8k | else { |
203 | 51.8k | result[j++] = ch; |
204 | 51.8k | } |
205 | 62.6k | i++; |
206 | 62.6k | } |
207 | | |
208 | 895 | result[j] = '\0'; // Null-terminate the result string |
209 | 895 | res = PyUnicode_DecodeUTF8(result, j, NULL); |
210 | 895 | PyMem_Free(result); |
211 | 8.76k | } else { |
212 | 8.76k | res = PyUnicode_DecodeUTF8( |
213 | 8.76k | tok_mode->last_expr_buffer, |
214 | 8.76k | tok_mode->last_expr_size - tok_mode->last_expr_end, |
215 | 8.76k | NULL |
216 | 8.76k | ); |
217 | 8.76k | } |
218 | | |
219 | 9.65k | if (!res) { |
220 | 0 | return -1; |
221 | 0 | } |
222 | 9.65k | token->metadata = res; |
223 | 9.65k | return 0; |
224 | 9.65k | } |
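
The hash detection above is essentially a quote-parity scan. Here is a self-contained sketch of the same idea over a NUL-terminated buffer (the real code works with explicit sizes and also rewrites the buffer afterwards): it toggles in_string on matching quotes, skips backslash escapes, and stops at the first '#' outside any string.

    #include <stdio.h>

    /* Return the index of the first '#' not inside a quoted literal, or -1. */
    static int find_unquoted_hash(const char *expr) {
        int in_string = 0;
        char quote_char = 0;
        for (int i = 0; expr[i] != '\0'; i++) {
            char ch = expr[i];
            if (ch == '\\' && expr[i + 1] != '\0') {
                i++;            /* skip the escaped character */
                continue;
            }
            if (ch == '"' || ch == '\'') {
                if (!in_string) { in_string = 1; quote_char = ch; }
                else if (ch == quote_char) { in_string = 0; }
                continue;
            }
            if (ch == '#' && !in_string) {
                return i;
            }
        }
        return -1;
    }

    int main(void) {
        printf("%d\n", find_unquoted_hash("x + '#' + y"));   /* -1: quoted */
        printf("%d\n", find_unquoted_hash("x + y  # note")); /* 7: comment */
        return 0;
    }
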
225 | | |
226 | | int |
227 | | _PyLexer_update_ftstring_expr(struct tok_state *tok, char cur) |
228 | 62.7k | { |
229 | 62.7k | assert(tok->cur != NULL); |
230 | | |
231 | 62.7k | Py_ssize_t size = strlen(tok->cur); |
232 | 62.7k | tokenizer_mode *tok_mode = TOK_GET_MODE(tok); |
233 | | |
234 | 62.7k | switch (cur) { |
235 | 0 | case 0: |
236 | 0 | if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) { |
237 | 0 | return 1; |
238 | 0 | } |
239 | 0 | char *new_buffer = PyMem_Realloc( |
240 | 0 | tok_mode->last_expr_buffer, |
241 | 0 | tok_mode->last_expr_size + size |
242 | 0 | ); |
243 | 0 | if (new_buffer == NULL) { |
244 | 0 | PyMem_Free(tok_mode->last_expr_buffer); |
245 | 0 | goto error; |
246 | 0 | } |
247 | 0 | tok_mode->last_expr_buffer = new_buffer; |
248 | 0 | strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size); |
249 | 0 | tok_mode->last_expr_size += size; |
250 | 0 | break; |
251 | 39.2k | case '{': |
252 | 39.2k | if (tok_mode->last_expr_buffer != NULL) { |
253 | 28.4k | PyMem_Free(tok_mode->last_expr_buffer); |
254 | 28.4k | } |
255 | 39.2k | tok_mode->last_expr_buffer = PyMem_Malloc(size); |
256 | 39.2k | if (tok_mode->last_expr_buffer == NULL) { |
257 | 0 | goto error; |
258 | 0 | } |
259 | 39.2k | tok_mode->last_expr_size = size; |
260 | 39.2k | tok_mode->last_expr_end = -1; |
261 | 39.2k | strncpy(tok_mode->last_expr_buffer, tok->cur, size); |
262 | 39.2k | break; |
263 | 18.3k | case '}': |
264 | 19.9k | case '!': |
265 | 19.9k | tok_mode->last_expr_end = strlen(tok->start); |
266 | 19.9k | break; |
267 | 3.50k | case ':': |
268 | 3.50k | if (tok_mode->last_expr_end == -1) { |
269 | 3.17k | tok_mode->last_expr_end = strlen(tok->start); |
270 | 3.17k | } |
271 | 3.50k | break; |
272 | 0 | default: |
273 | 0 | Py_UNREACHABLE(); |
274 | 62.7k | } |
275 | 62.7k | return 1; |
276 | 0 | error: |
277 | 0 | tok->done = E_NOMEM; |
278 | 0 | return 0; |
279 | 62.7k | } |
280 | | |
281 | | static int |
282 | | lookahead(struct tok_state *tok, const char *test) |
283 | 9.09k | { |
284 | 9.09k | const char *s = test; |
285 | 9.09k | int res = 0; |
286 | 23.8k | while (1) { |
287 | 23.8k | int c = tok_nextc(tok); |
288 | 23.8k | if (*s == 0) { |
289 | 9.00k | res = !is_potential_identifier_char(c); |
290 | 9.00k | } |
291 | 14.8k | else if (c == *s) { |
292 | 14.7k | s++; |
293 | 14.7k | continue; |
294 | 14.7k | } |
295 | | |
296 | 9.09k | tok_backup(tok, c); |
297 | 23.8k | while (s != test) { |
298 | 14.7k | tok_backup(tok, *--s); |
299 | 14.7k | } |
300 | 9.09k | return res; |
301 | 23.8k | } |
302 | 9.09k | } |
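
lookahead() consumes characters only transiently: it matches the tail of a keyword, checks that the next character cannot continue an identifier, then pushes everything back with tok_backup() so the input stream is unchanged. A detached sketch over a plain buffer (buf_lookahead is a hypothetical helper, not the tokenizer API):

    #include <stdio.h>
    #include <string.h>

    #define is_ident_char(c) \
        ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || \
         (c >= '0' && c <= '9') || c == '_' || ((unsigned char)(c) >= 128))

    /* 1 if buf starts with `rest` followed by a non-identifier character. */
    static int buf_lookahead(const char *buf, const char *rest) {
        size_t n = strlen(rest);
        return strncmp(buf, rest, n) == 0 && !is_ident_char(buf[n]);
    }

    int main(void) {
        /* After '1' then 'a' in "1and x", verify_end_of_number probes "nd". */
        printf("%d %d\n", buf_lookahead("nd x", "nd"),  /* 1: keyword ends */
                          buf_lookahead("ndx", "nd"));  /* 0: identifier */
        return 0;
    }
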
303 | | |
304 | | static int |
305 | 93.6k | verify_end_of_number(struct tok_state *tok, int c, const char *kind) { |
306 | 93.6k | if (tok->tok_extra_tokens) { |
307 | | // When we are parsing extra tokens, we don't want to emit warnings |
308 | | // about invalid literals, because we want to be a bit more liberal. |
309 | 0 | return 1; |
310 | 0 | } |
311 | | /* Emit a deprecation warning only if the numeric literal is immediately |
312 | | * followed by one of the keywords which can occur after a numeric literal
313 | | * in valid code: "and", "else", "for", "if", "in", "is", "not" and "or".
314 | | * This allows gradually deprecating existing valid code without adding
315 | | * a warning before the error in most cases of invalid numeric literals (which
316 | | * would be confusing and break existing tests). |
317 | | * Raise a syntax error with slightly better message than plain |
318 | | * "invalid syntax" if the numeric literal is immediately followed by |
319 | | * other keyword or identifier. |
320 | | */ |
321 | 93.6k | int r = 0; |
322 | 93.6k | if (c == 'a') { |
323 | 1.16k | r = lookahead(tok, "nd"); |
324 | 1.16k | } |
325 | 92.4k | else if (c == 'e') { |
326 | 513 | r = lookahead(tok, "lse"); |
327 | 513 | } |
328 | 91.9k | else if (c == 'f') { |
329 | 3.35k | r = lookahead(tok, "or"); |
330 | 3.35k | } |
331 | 88.5k | else if (c == 'i') { |
332 | 1.50k | int c2 = tok_nextc(tok); |
333 | 1.50k | if (c2 == 'f' || c2 == 'n' || c2 == 's') { |
334 | 1.48k | r = 1; |
335 | 1.48k | } |
336 | 1.50k | tok_backup(tok, c2); |
337 | 1.50k | } |
338 | 87.0k | else if (c == 'o') { |
339 | 3.75k | r = lookahead(tok, "r"); |
340 | 3.75k | } |
341 | 83.3k | else if (c == 'n') { |
342 | 306 | r = lookahead(tok, "ot"); |
343 | 306 | } |
344 | 93.6k | if (r) { |
345 | 10.4k | tok_backup(tok, c); |
346 | 10.4k | if (_PyTokenizer_parser_warn(tok, PyExc_SyntaxWarning, |
347 | 10.4k | "invalid %s literal", kind)) |
348 | 0 | { |
349 | 0 | return 0; |
350 | 0 | } |
351 | 10.4k | tok_nextc(tok); |
352 | 10.4k | } |
353 | 83.1k | else /* In future releases, only error will remain. */ |
354 | 83.1k | if (c < 128 && is_potential_identifier_char(c)) { |
355 | 194 | tok_backup(tok, c); |
356 | 194 | _PyTokenizer_syntaxerror(tok, "invalid %s literal", kind); |
357 | 194 | return 0; |
358 | 194 | } |
359 | 93.4k | return 1; |
360 | 93.6k | } |
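
Concretely: "0x1for x in y" only draws a SyntaxWarning today (after the hex digits, the literal is followed by "or", a keyword that can legally appear there), while "0x1z" is an immediate "invalid hexadecimal literal" SyntaxError. A toy classifier that reproduces the policy over ASCII input (a sketch; the real code streams characters rather than inspecting a buffer):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* 0 = fine, 1 = deprecation warning, 2 = syntax error. */
    static int classify_tail(const char *after_number) {
        static const char *keywords[] =
            { "and", "else", "for", "if", "in", "is", "not", "or" };
        for (size_t i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
            size_t n = strlen(keywords[i]);
            if (strncmp(after_number, keywords[i], n) == 0 &&
                !(isalnum((unsigned char)after_number[n]) ||
                  after_number[n] == '_')) {
                return 1;   /* e.g. "1if x else y": warn, still tokenizes */
            }
        }
        if (isalnum((unsigned char)after_number[0]) || after_number[0] == '_') {
            return 2;       /* e.g. "0x1z": invalid literal */
        }
        return 0;
    }

    int main(void) {
        printf("%d %d %d\n", classify_tail("in y"),   /* 1 */
                             classify_tail("z"),      /* 2 */
                             classify_tail(" + 1"));  /* 0 */
        return 0;
    }
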
361 | | |
362 | | /* Verify that the identifier follows PEP 3131. */ |
363 | | static int |
364 | | verify_identifier(struct tok_state *tok) |
365 | 11.9k | { |
366 | 11.9k | if (tok->tok_extra_tokens) { |
367 | 0 | return 1; |
368 | 0 | } |
369 | 11.9k | PyObject *s; |
370 | 11.9k | if (tok->decoding_erred) |
371 | 0 | return 0; |
372 | 11.9k | s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); |
373 | 11.9k | if (s == NULL) { |
374 | 1 | if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { |
375 | 1 | tok->done = E_DECODE; |
376 | 1 | } |
377 | 0 | else { |
378 | 0 | tok->done = E_ERROR; |
379 | 0 | } |
380 | 1 | return 0; |
381 | 1 | } |
382 | 11.9k | Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); |
383 | 11.9k | assert(invalid >= 0); |
384 | 11.9k | assert(PyUnicode_GET_LENGTH(s) > 0); |
385 | 11.9k | if (invalid < PyUnicode_GET_LENGTH(s)) { |
386 | 711 | Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); |
387 | 711 | if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { |
388 | | /* Determine the offset in UTF-8 encoded input */ |
389 | 480 | Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); |
390 | 480 | if (s != NULL) { |
391 | 480 | Py_SETREF(s, PyUnicode_AsUTF8String(s)); |
392 | 480 | } |
393 | 480 | if (s == NULL) { |
394 | 0 | tok->done = E_ERROR; |
395 | 0 | return 0; |
396 | 0 | } |
397 | 480 | tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); |
398 | 480 | } |
399 | 711 | Py_DECREF(s); |
400 | 711 | if (Py_UNICODE_ISPRINTABLE(ch)) { |
401 | 370 | _PyTokenizer_syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch); |
402 | 370 | } |
403 | 341 | else { |
404 | 341 | _PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", ch); |
405 | 341 | } |
406 | 711 | return 0; |
407 | 711 | } |
408 | 11.2k | Py_DECREF(s); |
409 | 11.2k | return 1; |
410 | 11.9k | } |
411 | | |
412 | | static int |
413 | | tok_decimal_tail(struct tok_state *tok) |
414 | 75.3k | { |
415 | 75.3k | int c; |
416 | | |
417 | 75.8k | while (1) { |
418 | 218k | do { |
419 | 218k | c = tok_nextc(tok); |
420 | 218k | } while (Py_ISDIGIT(c)); |
421 | 75.8k | if (c != '_') { |
422 | 75.3k | break; |
423 | 75.3k | } |
424 | 534 | c = tok_nextc(tok); |
425 | 534 | if (!Py_ISDIGIT(c)) { |
426 | 12 | tok_backup(tok, c); |
427 | 12 | _PyTokenizer_syntaxerror(tok, "invalid decimal literal"); |
428 | 12 | return 0; |
429 | 12 | } |
430 | 534 | } |
431 | 75.3k | return c; |
432 | 75.3k | } |
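
The underscore rule enforced here — a single '_' is allowed only between two digits — can be stated as a standalone validator (a sketch over a complete NUL-terminated literal, whereas tok_decimal_tail streams characters):

    #include <ctype.h>
    #include <stdio.h>

    /* 1 if s is digits with optional single underscores between them. */
    static int valid_decimal_tail(const char *s) {
        if (!isdigit((unsigned char)*s)) {
            return 0;
        }
        while (*s) {
            if (isdigit((unsigned char)*s)) { s++; continue; }
            if (*s == '_' && isdigit((unsigned char)s[1])) { s += 2; continue; }
            return 0;  /* trailing or doubled '_': "invalid decimal literal" */
        }
        return 1;
    }

    int main(void) {
        printf("%d %d %d\n", valid_decimal_tail("1_000"),  /* 1 */
                             valid_decimal_tail("1__0"),   /* 0 */
                             valid_decimal_tail("10_"));   /* 0 */
        return 0;
    }
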
433 | | |
434 | | static inline int |
435 | 1.09k | tok_continuation_line(struct tok_state *tok) { |
436 | 1.09k | int c = tok_nextc(tok); |
437 | 1.09k | if (c == '\r') { |
438 | 70 | c = tok_nextc(tok); |
439 | 70 | } |
440 | 1.09k | if (c != '\n') { |
441 | 51 | tok->done = E_LINECONT; |
442 | 51 | return -1; |
443 | 51 | } |
444 | 1.04k | c = tok_nextc(tok); |
445 | 1.04k | if (c == EOF) { |
446 | 55 | tok->done = E_EOF; |
447 | 55 | tok->cur = tok->inp; |
448 | 55 | return -1; |
449 | 987 | } else { |
450 | 987 | tok_backup(tok, c); |
451 | 987 | } |
452 | 987 | return c; |
453 | 1.04k | } |
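
The helper accepts a backslash followed by '\n' or '\r\n'; anything else sets E_LINECONT, and hitting EOF right after the continuation sets E_EOF. The acceptance test, detached from the tokenizer (continuation_len is a hypothetical helper):

    #include <stdio.h>

    /* Bytes consumed by a valid line continuation at p, or 0 if invalid. */
    static int continuation_len(const char *p) {
        if (p[0] != '\\') return 0;
        if (p[1] == '\n') return 2;
        if (p[1] == '\r' && p[2] == '\n') return 3;
        return 0;  /* the real lexer reports E_LINECONT here */
    }

    int main(void) {
        printf("%d %d %d\n", continuation_len("\\\n"),    /* 2 */
                             continuation_len("\\\r\n"),  /* 3 */
                             continuation_len("\\ \n"));  /* 0 */
        return 0;
    }
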
454 | | |
455 | | static int |
456 | | maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok, |
457 | | int saw_b, int saw_r, int saw_u, |
458 | 20.0k | int saw_f, int saw_t) { |
459 | | // Supported: rb, rf, rt (in any order) |
460 | | // Unsupported: ub, ur, uf, ut, bf, bt, ft (in any order) |
461 | | |
462 | 20.0k | #define RETURN_SYNTAX_ERROR(PREFIX1, PREFIX2) \ |
463 | 20.0k | do { \ |
464 | 7 | (void)_PyTokenizer_syntaxerror_known_range( \ |
465 | 7 | tok, (int)(tok->start + 1 - tok->line_start), \ |
466 | 7 | (int)(tok->cur - tok->line_start), \ |
467 | 7 | "'" PREFIX1 "' and '" PREFIX2 "' prefixes are incompatible"); \ |
468 | 7 | return -1; \ |
469 | 7 | } while (0) |
470 | | |
471 | 20.0k | if (saw_u && saw_b) { |
472 | 1 | RETURN_SYNTAX_ERROR("u", "b"); |
473 | 1 | } |
474 | 20.0k | if (saw_u && saw_r) { |
475 | 1 | RETURN_SYNTAX_ERROR("u", "r"); |
476 | 1 | } |
477 | 20.0k | if (saw_u && saw_f) { |
478 | 1 | RETURN_SYNTAX_ERROR("u", "f"); |
479 | 1 | } |
480 | 20.0k | if (saw_u && saw_t) { |
481 | 1 | RETURN_SYNTAX_ERROR("u", "t"); |
482 | 1 | } |
483 | | |
484 | 20.0k | if (saw_b && saw_f) { |
485 | 1 | RETURN_SYNTAX_ERROR("b", "f"); |
486 | 1 | } |
487 | 20.0k | if (saw_b && saw_t) { |
488 | 1 | RETURN_SYNTAX_ERROR("b", "t"); |
489 | 1 | } |
490 | | |
491 | 20.0k | if (saw_f && saw_t) { |
492 | 1 | RETURN_SYNTAX_ERROR("f", "t"); |
493 | 1 | } |
494 | | |
495 | 19.9k | #undef RETURN_SYNTAX_ERROR |
496 | | |
497 | 19.9k | return 0; |
498 | 20.0k | } |
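
Spelled out, the policy is: 'u' combines with nothing, 'b' excludes 'f' and 't', and 'f'/'t' exclude each other — which leaves rb, rf and rt (in either order and any case) as the only legal two-letter prefixes. A compact mirror of those checks (illustrative only):

    #include <stdio.h>

    /* 1 if the combination of seen prefix letters is legal. */
    static int prefixes_ok(int b, int r, int u, int f, int t) {
        if (u && (b || r || f || t)) return 0;  /* u stands alone */
        if (b && (f || t)) return 0;            /* bytes vs f/t-string */
        if (f && t) return 0;                   /* f and t are exclusive */
        return 1;
    }

    int main(void) {
        printf("rb=%d rf=%d uf=%d ft=%d\n",
               prefixes_ok(1, 1, 0, 0, 0),   /* rb=1 */
               prefixes_ok(0, 1, 0, 1, 0),   /* rf=1 */
               prefixes_ok(0, 0, 1, 1, 0),   /* uf=0 */
               prefixes_ok(0, 0, 0, 1, 1));  /* ft=0 */
        return 0;
    }
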
499 | | |
500 | | static int |
501 | | tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) |
502 | 1.56M | { |
503 | 1.56M | int c; |
504 | 1.56M | int blankline, nonascii; |
505 | | |
506 | 1.56M | const char *p_start = NULL; |
507 | 1.56M | const char *p_end = NULL; |
508 | 1.65M | nextline: |
509 | 1.65M | tok->start = NULL; |
510 | 1.65M | tok->starting_col_offset = -1; |
511 | 1.65M | blankline = 0; |
512 | | |
513 | | |
514 | | /* Get indentation level */ |
515 | 1.65M | if (tok->atbol) { |
516 | 225k | int col = 0; |
517 | 225k | int altcol = 0; |
518 | 225k | tok->atbol = 0; |
519 | 225k | int cont_line_col = 0; |
520 | 961k | for (;;) { |
521 | 961k | c = tok_nextc(tok); |
522 | 961k | if (c == ' ') { |
523 | 733k | col++, altcol++; |
524 | 733k | } |
525 | 227k | else if (c == '\t') { |
526 | 495 | col = (col / tok->tabsize + 1) * tok->tabsize; |
527 | 495 | altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE; |
528 | 495 | } |
529 | 227k | else if (c == '\014') {/* Control-L (formfeed) */ |
530 | 1.22k | col = altcol = 0; /* For Emacs users */ |
531 | 1.22k | } |
532 | 226k | else if (c == '\\') { |
533 | | // Indentation cannot be split over multiple physical lines |
534 | | // using backslashes. This means that if we found a backslash |
535 | | // preceded by whitespace, **the first one we find** determines |
536 | | // the level of indentation of whatever comes next. |
537 | 655 | cont_line_col = cont_line_col ? cont_line_col : col; |
538 | 655 | if ((c = tok_continuation_line(tok)) == -1) { |
539 | 42 | return MAKE_TOKEN(ERRORTOKEN); |
540 | 42 | } |
541 | 655 | } |
542 | 225k | else if (c == EOF && PyErr_Occurred()) { |
543 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
544 | 0 | } |
545 | 225k | else { |
546 | 225k | break; |
547 | 225k | } |
548 | 961k | } |
549 | 225k | tok_backup(tok, c); |
550 | 225k | if (c == '#' || c == '\n' || c == '\r') { |
551 | | /* Lines with only whitespace and/or comments |
552 | | shouldn't affect the indentation and are |
553 | | not passed to the parser as NEWLINE tokens, |
554 | | except *totally* empty lines in interactive |
555 | | mode, which signal the end of a command group. */ |
556 | 53.3k | if (col == 0 && c == '\n' && tok->prompt != NULL) { |
557 | 0 | blankline = 0; /* Let it through */ |
558 | 0 | } |
559 | 53.3k | else if (tok->prompt != NULL && tok->lineno == 1) { |
560 | | /* In interactive mode, if the first line contains |
561 | | only spaces and/or a comment, let it through. */ |
562 | 0 | blankline = 0; |
563 | 0 | col = altcol = 0; |
564 | 0 | } |
565 | 53.3k | else { |
566 | 53.3k | blankline = 1; /* Ignore completely */ |
567 | 53.3k | } |
568 | | /* We can't jump back right here since we still |
569 | | may need to skip to the end of a comment */ |
570 | 53.3k | } |
571 | 225k | if (!blankline && tok->level == 0) { |
572 | 130k | col = cont_line_col ? cont_line_col : col; |
573 | 130k | altcol = cont_line_col ? cont_line_col : altcol; |
574 | 130k | if (col == tok->indstack[tok->indent]) { |
575 | | /* No change */ |
576 | 90.1k | if (altcol != tok->altindstack[tok->indent]) { |
577 | 1 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
578 | 1 | } |
579 | 90.1k | } |
580 | 40.6k | else if (col > tok->indstack[tok->indent]) { |
581 | | /* Indent -- always one */ |
582 | 22.7k | if (tok->indent+1 >= MAXINDENT) { |
583 | 0 | tok->done = E_TOODEEP; |
584 | 0 | tok->cur = tok->inp; |
585 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
586 | 0 | } |
587 | 22.7k | if (altcol <= tok->altindstack[tok->indent]) { |
588 | 2 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
589 | 2 | } |
590 | 22.7k | tok->pendin++; |
591 | 22.7k | tok->indstack[++tok->indent] = col; |
592 | 22.7k | tok->altindstack[tok->indent] = altcol; |
593 | 22.7k | } |
594 | 17.8k | else /* col < tok->indstack[tok->indent] */ { |
595 | | /* Dedent -- any number, must be consistent */ |
596 | 39.9k | while (tok->indent > 0 && |
597 | 35.1k | col < tok->indstack[tok->indent]) { |
598 | 22.0k | tok->pendin--; |
599 | 22.0k | tok->indent--; |
600 | 22.0k | } |
601 | 17.8k | if (col != tok->indstack[tok->indent]) { |
602 | 6 | tok->done = E_DEDENT; |
603 | 6 | tok->cur = tok->inp; |
604 | 6 | return MAKE_TOKEN(ERRORTOKEN); |
605 | 6 | } |
606 | 17.8k | if (altcol != tok->altindstack[tok->indent]) { |
607 | 1 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
608 | 1 | } |
609 | 17.8k | } |
610 | 130k | } |
611 | 225k | } |
612 | | |
613 | 1.65M | tok->start = tok->cur; |
614 | 1.65M | tok->starting_col_offset = tok->col_offset; |
615 | | |
616 | | /* Return pending indents/dedents */ |
617 | 1.65M | if (tok->pendin != 0) { |
618 | 44.7k | if (tok->pendin < 0) { |
619 | 22.0k | if (tok->tok_extra_tokens) { |
620 | 0 | p_start = tok->cur; |
621 | 0 | p_end = tok->cur; |
622 | 0 | } |
623 | 22.0k | tok->pendin++; |
624 | 22.0k | return MAKE_TOKEN(DEDENT); |
625 | 22.0k | } |
626 | 22.7k | else { |
627 | 22.7k | if (tok->tok_extra_tokens) { |
628 | 0 | p_start = tok->buf; |
629 | 0 | p_end = tok->cur; |
630 | 0 | } |
631 | 22.7k | tok->pendin--; |
632 | 22.7k | return MAKE_TOKEN(INDENT); |
633 | 22.7k | } |
634 | 44.7k | } |
635 | | |
636 | | /* Peek ahead at the next character */ |
637 | 1.61M | c = tok_nextc(tok); |
638 | 1.61M | tok_backup(tok, c); |
639 | | |
640 | 1.61M | again: |
641 | 1.61M | tok->start = NULL; |
642 | | /* Skip spaces */ |
643 | 1.96M | do { |
644 | 1.96M | c = tok_nextc(tok); |
645 | 1.96M | } while (c == ' ' || c == '\t' || c == '\014'); |
646 | | |
647 | | /* Set start of current token */ |
648 | 1.61M | tok->start = tok->cur == NULL ? NULL : tok->cur - 1; |
649 | 1.61M | tok->starting_col_offset = tok->col_offset - 1; |
650 | | |
651 | | /* Skip comment, unless it's a type comment */ |
652 | 1.61M | if (c == '#') { |
653 | | |
654 | 39.0k | const char* p = NULL; |
655 | 39.0k | const char *prefix, *type_start; |
656 | 39.0k | int current_starting_col_offset; |
657 | | |
658 | 1.29M | while (c != EOF && c != '\n' && c != '\r') { |
659 | 1.25M | c = tok_nextc(tok); |
660 | 1.25M | } |
661 | | |
662 | 39.0k | if (tok->tok_extra_tokens) { |
663 | 0 | p = tok->start; |
664 | 0 | } |
665 | | |
666 | 39.0k | if (tok->type_comments) { |
667 | 0 | p = tok->start; |
668 | 0 | current_starting_col_offset = tok->starting_col_offset; |
669 | 0 | prefix = type_comment_prefix; |
670 | 0 | while (*prefix && p < tok->cur) { |
671 | 0 | if (*prefix == ' ') { |
672 | 0 | while (*p == ' ' || *p == '\t') { |
673 | 0 | p++; |
674 | 0 | current_starting_col_offset++; |
675 | 0 | } |
676 | 0 | } else if (*prefix == *p) { |
677 | 0 | p++; |
678 | 0 | current_starting_col_offset++; |
679 | 0 | } else { |
680 | 0 | break; |
681 | 0 | } |
682 | | |
683 | 0 | prefix++; |
684 | 0 | } |
685 | | |
686 | | /* This is a type comment if we matched all of type_comment_prefix. */ |
687 | 0 | if (!*prefix) { |
688 | 0 | int is_type_ignore = 1; |
689 | | // +6 in order to skip the word 'ignore' |
690 | 0 | const char *ignore_end = p + 6; |
691 | 0 | const int ignore_end_col_offset = current_starting_col_offset + 6; |
692 | 0 | tok_backup(tok, c); /* don't eat the newline or EOF */ |
693 | |
694 | 0 | type_start = p; |
695 | | |
696 | | /* A TYPE_IGNORE is "type: ignore" followed by the end of the token |
697 | | * or anything ASCII and non-alphanumeric. */ |
698 | 0 | is_type_ignore = ( |
699 | 0 | tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 |
700 | 0 | && !(tok->cur > ignore_end |
701 | 0 | && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); |
702 | |
703 | 0 | if (is_type_ignore) { |
704 | 0 | p_start = ignore_end; |
705 | 0 | p_end = tok->cur; |
706 | | |
707 | | /* If this type ignore is the only thing on the line, consume the newline also. */ |
708 | 0 | if (blankline) { |
709 | 0 | tok_nextc(tok); |
710 | 0 | tok->atbol = 1; |
711 | 0 | } |
712 | 0 | return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); |
713 | 0 | } else { |
714 | 0 | p_start = type_start; |
715 | 0 | p_end = tok->cur; |
716 | 0 | return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset); |
717 | 0 | } |
718 | 0 | } |
719 | 0 | } |
720 | 39.0k | if (tok->tok_extra_tokens) { |
721 | 0 | tok_backup(tok, c); /* don't eat the newline or EOF */ |
722 | 0 | p_start = p; |
723 | 0 | p_end = tok->cur; |
724 | 0 | tok->comment_newline = blankline; |
725 | 0 | return MAKE_TOKEN(COMMENT); |
726 | 0 | } |
727 | 39.0k | } |
728 | | |
729 | 1.61M | if (tok->done == E_INTERACT_STOP) { |
730 | 0 | return MAKE_TOKEN(ENDMARKER); |
731 | 0 | } |
732 | | |
733 | | /* Check for EOF and errors now */ |
734 | 1.61M | if (c == EOF) { |
735 | 16.0k | if (tok->level) { |
736 | 4.08k | return MAKE_TOKEN(ERRORTOKEN); |
737 | 4.08k | } |
738 | 11.9k | return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN); |
739 | 16.0k | } |
740 | | |
741 | | /* Identifier (most frequent token!) */ |
742 | 1.59M | nonascii = 0; |
743 | 1.59M | if (is_potential_identifier_start(c)) { |
744 | | /* Process the various legal combinations of b"", r"", u"", and f"". */ |
745 | 518k | int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0; |
746 | 639k | while (1) { |
747 | 639k | if (!saw_b && (c == 'b' || c == 'B')) { |
748 | 20.9k | saw_b = 1; |
749 | 20.9k | } |
750 | | /* Since this literal exists only for backwards-compatibility support,
751 | | we don't want to support it in arbitrary order like byte literals. */
752 | 619k | else if (!saw_u && (c == 'u'|| c == 'U')) { |
753 | 7.03k | saw_u = 1; |
754 | 7.03k | } |
755 | | /* ur"" and ru"" are not supported */ |
756 | 612k | else if (!saw_r && (c == 'r' || c == 'R')) { |
757 | 35.9k | saw_r = 1; |
758 | 35.9k | } |
759 | 576k | else if (!saw_f && (c == 'f' || c == 'F')) { |
760 | 44.3k | saw_f = 1; |
761 | 44.3k | } |
762 | 531k | else if (!saw_t && (c == 't' || c == 'T')) { |
763 | 33.4k | saw_t = 1; |
764 | 33.4k | } |
765 | 498k | else { |
766 | 498k | break; |
767 | 498k | } |
768 | 141k | c = tok_nextc(tok); |
769 | 141k | if (c == '"' || c == '\'') { |
770 | | // Raise error on incompatible string prefixes: |
771 | 20.0k | int status = maybe_raise_syntax_error_for_string_prefixes( |
772 | 20.0k | tok, saw_b, saw_r, saw_u, saw_f, saw_t); |
773 | 20.0k | if (status < 0) { |
774 | 7 | return MAKE_TOKEN(ERRORTOKEN); |
775 | 7 | } |
776 | | |
777 | | // Handle valid f or t string creation: |
778 | 19.9k | if (saw_f || saw_t) { |
779 | 15.9k | goto f_string_quote; |
780 | 15.9k | } |
781 | 4.04k | goto letter_quote; |
782 | 19.9k | } |
783 | 141k | } |
784 | 2.26M | while (is_potential_identifier_char(c)) { |
785 | 1.76M | if (c >= 128) { |
786 | 109k | nonascii = 1; |
787 | 109k | } |
788 | 1.76M | c = tok_nextc(tok); |
789 | 1.76M | } |
790 | 498k | tok_backup(tok, c); |
791 | 498k | if (nonascii && !verify_identifier(tok)) { |
792 | 712 | return MAKE_TOKEN(ERRORTOKEN); |
793 | 712 | } |
794 | | |
795 | 497k | p_start = tok->start; |
796 | 497k | p_end = tok->cur; |
797 | | |
798 | 497k | return MAKE_TOKEN(NAME); |
799 | 498k | } |
800 | | |
801 | 1.08M | if (c == '\r') { |
802 | 416 | c = tok_nextc(tok); |
803 | 416 | } |
804 | | |
805 | | /* Newline */ |
806 | 1.08M | if (c == '\n') { |
807 | 206k | tok->atbol = 1; |
808 | 206k | if (blankline || tok->level > 0) { |
809 | 94.5k | if (tok->tok_extra_tokens) { |
810 | 0 | if (tok->comment_newline) { |
811 | 0 | tok->comment_newline = 0; |
812 | 0 | } |
813 | 0 | p_start = tok->start; |
814 | 0 | p_end = tok->cur; |
815 | 0 | return MAKE_TOKEN(NL); |
816 | 0 | } |
817 | 94.5k | goto nextline; |
818 | 94.5k | } |
819 | 112k | if (tok->comment_newline && tok->tok_extra_tokens) { |
820 | 0 | tok->comment_newline = 0; |
821 | 0 | p_start = tok->start; |
822 | 0 | p_end = tok->cur; |
823 | 0 | return MAKE_TOKEN(NL); |
824 | 0 | } |
825 | 112k | p_start = tok->start; |
826 | 112k | p_end = tok->cur - 1; /* Leave '\n' out of the string */ |
827 | 112k | tok->cont_line = 0; |
828 | 112k | return MAKE_TOKEN(NEWLINE); |
829 | 112k | } |
830 | | |
831 | | /* Period or number starting with period? */ |
832 | 873k | if (c == '.') { |
833 | 33.8k | c = tok_nextc(tok); |
834 | 33.8k | if (Py_ISDIGIT(c)) { |
835 | 3.27k | goto fraction; |
836 | 30.5k | } else if (c == '.') { |
837 | 1.41k | c = tok_nextc(tok); |
838 | 1.41k | if (c == '.') { |
839 | 742 | p_start = tok->start; |
840 | 742 | p_end = tok->cur; |
841 | 742 | return MAKE_TOKEN(ELLIPSIS); |
842 | 742 | } |
843 | 674 | else { |
844 | 674 | tok_backup(tok, c); |
845 | 674 | } |
846 | 674 | tok_backup(tok, '.'); |
847 | 674 | } |
848 | 29.1k | else { |
849 | 29.1k | tok_backup(tok, c); |
850 | 29.1k | } |
851 | 29.8k | p_start = tok->start; |
852 | 29.8k | p_end = tok->cur; |
853 | 29.8k | return MAKE_TOKEN(DOT); |
854 | 33.8k | } |
855 | | |
856 | | /* Number */ |
857 | 839k | if (Py_ISDIGIT(c)) { |
858 | 90.4k | if (c == '0') { |
859 | | /* Hex, octal or binary -- maybe. */ |
860 | 31.7k | c = tok_nextc(tok); |
861 | 31.7k | if (c == 'x' || c == 'X') { |
862 | | /* Hex */ |
863 | 15.8k | c = tok_nextc(tok); |
864 | 16.3k | do { |
865 | 16.3k | if (c == '_') { |
866 | 520 | c = tok_nextc(tok); |
867 | 520 | } |
868 | 16.3k | if (!Py_ISXDIGIT(c)) { |
869 | 20 | tok_backup(tok, c); |
870 | 20 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid hexadecimal literal")); |
871 | 20 | } |
872 | 80.6k | do { |
873 | 80.6k | c = tok_nextc(tok); |
874 | 80.6k | } while (Py_ISXDIGIT(c)); |
875 | 16.3k | } while (c == '_'); |
876 | 15.8k | if (!verify_end_of_number(tok, c, "hexadecimal")) { |
877 | 2 | return MAKE_TOKEN(ERRORTOKEN); |
878 | 2 | } |
879 | 15.8k | } |
880 | 15.8k | else if (c == 'o' || c == 'O') { |
881 | | /* Octal */ |
882 | 557 | c = tok_nextc(tok); |
883 | 868 | do { |
884 | 868 | if (c == '_') { |
885 | 317 | c = tok_nextc(tok); |
886 | 317 | } |
887 | 868 | if (c < '0' || c >= '8') { |
888 | 21 | if (Py_ISDIGIT(c)) { |
889 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
890 | 1 | "invalid digit '%c' in octal literal", c)); |
891 | 1 | } |
892 | 20 | else { |
893 | 20 | tok_backup(tok, c); |
894 | 20 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid octal literal")); |
895 | 20 | } |
896 | 21 | } |
897 | 2.05k | do { |
898 | 2.05k | c = tok_nextc(tok); |
899 | 2.05k | } while ('0' <= c && c < '8'); |
900 | 847 | } while (c == '_'); |
901 | 536 | if (Py_ISDIGIT(c)) { |
902 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
903 | 1 | "invalid digit '%c' in octal literal", c)); |
904 | 1 | } |
905 | 535 | if (!verify_end_of_number(tok, c, "octal")) { |
906 | 6 | return MAKE_TOKEN(ERRORTOKEN); |
907 | 6 | } |
908 | 535 | } |
909 | 15.3k | else if (c == 'b' || c == 'B') { |
910 | | /* Binary */ |
911 | 570 | c = tok_nextc(tok); |
912 | 1.00k | do { |
913 | 1.00k | if (c == '_') { |
914 | 444 | c = tok_nextc(tok); |
915 | 444 | } |
916 | 1.00k | if (c != '0' && c != '1') { |
917 | 19 | if (Py_ISDIGIT(c)) { |
918 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c)); |
919 | 1 | } |
920 | 18 | else { |
921 | 18 | tok_backup(tok, c); |
922 | 18 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid binary literal")); |
923 | 18 | } |
924 | 19 | } |
925 | 4.04k | do { |
926 | 4.04k | c = tok_nextc(tok); |
927 | 4.04k | } while (c == '0' || c == '1'); |
928 | 985 | } while (c == '_'); |
929 | 551 | if (Py_ISDIGIT(c)) { |
930 | 2 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c)); |
931 | 2 | } |
932 | 549 | if (!verify_end_of_number(tok, c, "binary")) { |
933 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
934 | 1 | } |
935 | 549 | } |
936 | 14.7k | else { |
937 | 14.7k | int nonzero = 0; |
938 | | /* maybe old-style octal; c is first char of it */ |
939 | | /* in any case, allow '0' as a literal */ |
940 | 16.9k | while (1) { |
941 | 16.9k | if (c == '_') { |
942 | 90 | c = tok_nextc(tok); |
943 | 90 | if (!Py_ISDIGIT(c)) { |
944 | 3 | tok_backup(tok, c); |
945 | 3 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal")); |
946 | 3 | } |
947 | 90 | } |
948 | 16.9k | if (c != '0') { |
949 | 14.7k | break; |
950 | 14.7k | } |
951 | 2.19k | c = tok_nextc(tok); |
952 | 2.19k | } |
953 | 14.7k | char* zeros_end = tok->cur; |
954 | 14.7k | if (Py_ISDIGIT(c)) { |
955 | 410 | nonzero = 1; |
956 | 410 | c = tok_decimal_tail(tok); |
957 | 410 | if (c == 0) { |
958 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
959 | 1 | } |
960 | 410 | } |
961 | 14.7k | if (c == '.') { |
962 | 920 | c = tok_nextc(tok); |
963 | 920 | goto fraction; |
964 | 920 | } |
965 | 13.8k | else if (c == 'e' || c == 'E') { |
966 | 849 | goto exponent; |
967 | 849 | } |
968 | 12.9k | else if (c == 'j' || c == 'J') { |
969 | 767 | goto imaginary; |
970 | 767 | } |
971 | 12.2k | else if (nonzero && !tok->tok_extra_tokens) { |
972 | | /* Old-style octal: now disallowed. */ |
973 | 28 | tok_backup(tok, c); |
974 | 28 | return MAKE_TOKEN(_PyTokenizer_syntaxerror_known_range( |
975 | 28 | tok, (int)(tok->start + 1 - tok->line_start), |
976 | 28 | (int)(zeros_end - tok->line_start), |
977 | 28 | "leading zeros in decimal integer " |
978 | 28 | "literals are not permitted; " |
979 | 28 | "use an 0o prefix for octal integers")); |
980 | 28 | } |
981 | 12.1k | if (!verify_end_of_number(tok, c, "decimal")) { |
982 | 28 | return MAKE_TOKEN(ERRORTOKEN); |
983 | 28 | } |
984 | 12.1k | } |
985 | 31.7k | } |
986 | 58.7k | else { |
987 | | /* Decimal */ |
988 | 58.7k | c = tok_decimal_tail(tok); |
989 | 58.7k | if (c == 0) { |
990 | 9 | return MAKE_TOKEN(ERRORTOKEN); |
991 | 9 | } |
992 | 58.7k | { |
993 | | /* Accept floating-point numbers. */ |
994 | 58.7k | if (c == '.') { |
995 | 3.93k | c = tok_nextc(tok); |
996 | 8.13k | fraction: |
997 | | /* Fraction */ |
998 | 8.13k | if (Py_ISDIGIT(c)) { |
999 | 6.07k | c = tok_decimal_tail(tok); |
1000 | 6.07k | if (c == 0) { |
1001 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
1002 | 1 | } |
1003 | 6.07k | } |
1004 | 8.13k | } |
1005 | 62.8k | if (c == 'e' || c == 'E') { |
1006 | 9.84k | int e; |
1007 | 10.6k | exponent: |
1008 | 10.6k | e = c; |
1009 | | /* Exponent part */ |
1010 | 10.6k | c = tok_nextc(tok); |
1011 | 10.6k | if (c == '+' || c == '-') { |
1012 | 3.80k | c = tok_nextc(tok); |
1013 | 3.80k | if (!Py_ISDIGIT(c)) { |
1014 | 10 | tok_backup(tok, c); |
1015 | 10 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal")); |
1016 | 10 | } |
1017 | 6.89k | } else if (!Py_ISDIGIT(c)) { |
1018 | 514 | tok_backup(tok, c); |
1019 | 514 | if (!verify_end_of_number(tok, e, "decimal")) { |
1020 | 33 | return MAKE_TOKEN(ERRORTOKEN); |
1021 | 33 | } |
1022 | 481 | tok_backup(tok, e); |
1023 | 481 | p_start = tok->start; |
1024 | 481 | p_end = tok->cur; |
1025 | 481 | return MAKE_TOKEN(NUMBER); |
1026 | 514 | } |
1027 | 10.1k | c = tok_decimal_tail(tok); |
1028 | 10.1k | if (c == 0) { |
1029 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
1030 | 1 | } |
1031 | 10.1k | } |
1032 | 63.2k | if (c == 'j' || c == 'J') { |
1033 | | /* Imaginary part */ |
1034 | 3.97k | imaginary: |
1035 | 3.97k | c = tok_nextc(tok); |
1036 | 3.97k | if (!verify_end_of_number(tok, c, "imaginary")) { |
1037 | 13 | return MAKE_TOKEN(ERRORTOKEN); |
1038 | 13 | } |
1039 | 3.97k | } |
1040 | 60.0k | else if (!verify_end_of_number(tok, c, "decimal")) { |
1041 | 111 | return MAKE_TOKEN(ERRORTOKEN); |
1042 | 111 | } |
1043 | 63.2k | } |
1044 | 63.2k | } |
1045 | 92.9k | tok_backup(tok, c); |
1046 | 92.9k | p_start = tok->start; |
1047 | 92.9k | p_end = tok->cur; |
1048 | 92.9k | return MAKE_TOKEN(NUMBER); |
1049 | 90.4k | } |
1050 | | |
1051 | 765k | f_string_quote: |
1052 | 765k | if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't') |
1053 | 15.9k | && (c == '\'' || c == '"'))) { |
1054 | | |
1055 | 15.9k | int quote = c; |
1056 | 15.9k | int quote_size = 1; /* 1 or 3 */ |
1057 | | |
1058 | | /* Nodes of type STRING, especially multi line strings |
1059 | | must be handled differently in order to get both |
1060 | | the starting line number and the column offset right. |
1061 | | (cf. issue 16806) */ |
1062 | 15.9k | tok->first_lineno = tok->lineno; |
1063 | 15.9k | tok->multi_line_start = tok->line_start; |
1064 | | |
1065 | | /* Find the quote size and start of string */ |
1066 | 15.9k | int after_quote = tok_nextc(tok); |
1067 | 15.9k | if (after_quote == quote) { |
1068 | 2.45k | int after_after_quote = tok_nextc(tok); |
1069 | 2.45k | if (after_after_quote == quote) { |
1070 | 752 | quote_size = 3; |
1071 | 752 | } |
1072 | 1.70k | else { |
1073 | | // TODO: Check this |
1074 | 1.70k | tok_backup(tok, after_after_quote); |
1075 | 1.70k | tok_backup(tok, after_quote); |
1076 | 1.70k | } |
1077 | 2.45k | } |
1078 | 15.9k | if (after_quote != quote) { |
1079 | 13.5k | tok_backup(tok, after_quote); |
1080 | 13.5k | } |
1081 | | |
1082 | | |
1083 | 15.9k | p_start = tok->start; |
1084 | 15.9k | p_end = tok->cur; |
1085 | 15.9k | if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) { |
1086 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested f-strings or t-strings")); |
1087 | 1 | } |
1088 | 15.9k | tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok); |
1089 | 15.9k | the_current_tok->kind = TOK_FSTRING_MODE; |
1090 | 15.9k | the_current_tok->quote = quote; |
1091 | 15.9k | the_current_tok->quote_size = quote_size; |
1092 | 15.9k | the_current_tok->start = tok->start; |
1093 | 15.9k | the_current_tok->multi_line_start = tok->line_start; |
1094 | 15.9k | the_current_tok->first_line = tok->lineno; |
1095 | 15.9k | the_current_tok->start_offset = -1; |
1096 | 15.9k | the_current_tok->multi_line_start_offset = -1; |
1097 | 15.9k | the_current_tok->last_expr_buffer = NULL; |
1098 | 15.9k | the_current_tok->last_expr_size = 0; |
1099 | 15.9k | the_current_tok->last_expr_end = -1; |
1100 | 15.9k | the_current_tok->in_format_spec = 0; |
1101 | 15.9k | the_current_tok->in_debug = 0; |
1102 | | |
1103 | 15.9k | enum string_kind_t string_kind = FSTRING; |
1104 | 15.9k | switch (*tok->start) { |
1105 | 548 | case 'T': |
1106 | 4.26k | case 't': |
1107 | 4.26k | the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r'; |
1108 | 4.26k | string_kind = TSTRING; |
1109 | 4.26k | break; |
1110 | 1.62k | case 'F': |
1111 | 11.3k | case 'f': |
1112 | 11.3k | the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r'; |
1113 | 11.3k | break; |
1114 | 68 | case 'R': |
1115 | 370 | case 'r': |
1116 | 370 | the_current_tok->raw = 1; |
1117 | 370 | if (Py_TOLOWER(*(tok->start + 1)) == 't') { |
1118 | 97 | string_kind = TSTRING; |
1119 | 97 | } |
1120 | 370 | break; |
1121 | 0 | default: |
1122 | 0 | Py_UNREACHABLE(); |
1123 | 15.9k | } |
1124 | | |
1125 | 15.9k | the_current_tok->string_kind = string_kind; |
1126 | 15.9k | the_current_tok->curly_bracket_depth = 0; |
1127 | 15.9k | the_current_tok->curly_bracket_expr_start_depth = -1; |
1128 | 15.9k | return string_kind == TSTRING ? MAKE_TOKEN(TSTRING_START) : MAKE_TOKEN(FSTRING_START); |
1129 | 15.9k | } |
1130 | | |
1131 | 753k | letter_quote: |
1132 | | /* String */ |
1133 | 753k | if (c == '\'' || c == '"') { |
1134 | 54.8k | int quote = c; |
1135 | 54.8k | int quote_size = 1; /* 1 or 3 */ |
1136 | 54.8k | int end_quote_size = 0; |
1137 | 54.8k | int has_escaped_quote = 0; |
1138 | | |
1139 | | /* Nodes of type STRING, especially multi line strings |
1140 | | must be handled differently in order to get both |
1141 | | the starting line number and the column offset right. |
1142 | | (cf. issue 16806) */ |
1143 | 54.8k | tok->first_lineno = tok->lineno; |
1144 | 54.8k | tok->multi_line_start = tok->line_start; |
1145 | | |
1146 | | /* Find the quote size and start of string */ |
1147 | 54.8k | c = tok_nextc(tok); |
1148 | 54.8k | if (c == quote) { |
1149 | 8.97k | c = tok_nextc(tok); |
1150 | 8.97k | if (c == quote) { |
1151 | 2.78k | quote_size = 3; |
1152 | 2.78k | } |
1153 | 6.18k | else { |
1154 | 6.18k | end_quote_size = 1; /* empty string found */ |
1155 | 6.18k | } |
1156 | 8.97k | } |
1157 | 54.8k | if (c != quote) { |
1158 | 52.0k | tok_backup(tok, c); |
1159 | 52.0k | } |
1160 | | |
1161 | | /* Get rest of string */ |
1162 | 1.08M | while (end_quote_size != quote_size) { |
1163 | 1.03M | c = tok_nextc(tok); |
1164 | 1.03M | if (tok->done == E_ERROR) { |
1165 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1166 | 0 | } |
1167 | 1.03M | if (tok->done == E_DECODE) { |
1168 | 0 | break; |
1169 | 0 | } |
1170 | 1.03M | if (c == EOF || (quote_size == 1 && c == '\n')) { |
1171 | 315 | assert(tok->multi_line_start != NULL); |
1172 | | // shift the tok_state's location into |
1173 | | // the start of string, and report the error |
1174 | | // from the initial quote character |
1175 | 315 | tok->cur = (char *)tok->start; |
1176 | 315 | tok->cur++; |
1177 | 315 | tok->line_start = tok->multi_line_start; |
1178 | 315 | int start = tok->lineno; |
1179 | 315 | tok->lineno = tok->first_lineno; |
1180 | | |
1181 | 315 | if (INSIDE_FSTRING(tok)) { |
1182 | | /* When we are in an f-string, before raising the |
1183 | | * unterminated string literal error, check whether |
1184 | | * the initial quote matches the f-string's quotes;
1185 | | * if it does, this must be a missing '}' token,
1186 | | * so raise the proper error */ |
1187 | 30 | tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); |
1188 | 30 | if (the_current_tok->quote == quote && |
1189 | 24 | the_current_tok->quote_size == quote_size) { |
1190 | 19 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1191 | 19 | "%c-string: expecting '}'", TOK_GET_STRING_PREFIX(tok))); |
1192 | 19 | } |
1193 | 30 | } |
1194 | | |
1195 | 296 | if (quote_size == 3) { |
1196 | 17 | _PyTokenizer_syntaxerror(tok, "unterminated triple-quoted string literal" |
1197 | 17 | " (detected at line %d)", start); |
1198 | 17 | if (c != '\n') { |
1199 | 17 | tok->done = E_EOFS; |
1200 | 17 | } |
1201 | 17 | return MAKE_TOKEN(ERRORTOKEN); |
1202 | 17 | } |
1203 | 279 | else { |
1204 | 279 | if (has_escaped_quote) { |
1205 | 10 | _PyTokenizer_syntaxerror( |
1206 | 10 | tok, |
1207 | 10 | "unterminated string literal (detected at line %d); " |
1208 | 10 | "perhaps you escaped the end quote?", |
1209 | 10 | start |
1210 | 10 | ); |
1211 | 269 | } else { |
1212 | 269 | _PyTokenizer_syntaxerror( |
1213 | 269 | tok, "unterminated string literal (detected at line %d)", start |
1214 | 269 | ); |
1215 | 269 | } |
1216 | 279 | if (c != '\n') { |
1217 | 14 | tok->done = E_EOLS; |
1218 | 14 | } |
1219 | 279 | return MAKE_TOKEN(ERRORTOKEN); |
1220 | 279 | } |
1221 | 296 | } |
1222 | 1.03M | if (c == quote) { |
1223 | 55.4k | end_quote_size += 1; |
1224 | 55.4k | } |
1225 | 976k | else { |
1226 | 976k | end_quote_size = 0; |
1227 | 976k | if (c == '\\') { |
1228 | 26.8k | c = tok_nextc(tok); /* skip escaped char */ |
1229 | 26.8k | if (c == quote) { /* but record whether the escaped char was a quote */ |
1230 | 953 | has_escaped_quote = 1; |
1231 | 953 | } |
1232 | 26.8k | if (c == '\r') { |
1233 | 67 | c = tok_nextc(tok); |
1234 | 67 | } |
1235 | 26.8k | } |
1236 | 976k | } |
1237 | 1.03M | } |
1238 | | |
1239 | 54.5k | p_start = tok->start; |
1240 | 54.5k | p_end = tok->cur; |
1241 | 54.5k | return MAKE_TOKEN(STRING); |
1242 | 54.8k | } |
1243 | | |
1244 | | /* Line continuation */ |
1245 | 698k | if (c == '\\') { |
1246 | 438 | if ((c = tok_continuation_line(tok)) == -1) { |
1247 | 64 | return MAKE_TOKEN(ERRORTOKEN); |
1248 | 64 | } |
1249 | 374 | tok->cont_line = 1; |
1250 | 374 | goto again; /* Read next line */ |
1251 | 438 | } |
1252 | | |
1253 | | /* Punctuation character */ |
1254 | 698k | int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{'); |
1255 | 698k | if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) { |
1256 | | /* This code block gets executed before the curly_bracket_depth is incremented |
1257 | | * by the `{` case, so to check that we are at the 0th level, we need
1258 | | * to adjust it manually */ |
1259 | 54.4k | int cursor = current_tok->curly_bracket_depth - (c != '{'); |
1260 | 54.4k | int in_format_spec = current_tok->in_format_spec; |
1261 | 54.4k | int cursor_in_format_with_debug = |
1262 | 54.4k | cursor == 1 && (current_tok->in_debug || in_format_spec); |
1263 | 54.4k | int cursor_valid = cursor == 0 || cursor_in_format_with_debug; |
1264 | 54.4k | if ((cursor_valid) && !_PyLexer_update_ftstring_expr(tok, c)) { |
1265 | 0 | return MAKE_TOKEN(ENDMARKER); |
1266 | 0 | } |
1267 | 54.4k | if ((cursor_valid) && c != '{' && set_ftstring_expr(tok, token, c)) { |
1268 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1269 | 0 | } |
1270 | | |
1271 | 54.4k | if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) { |
1272 | 4.69k | current_tok->kind = TOK_FSTRING_MODE; |
1273 | 4.69k | current_tok->in_format_spec = 1; |
1274 | 4.69k | p_start = tok->start; |
1275 | 4.69k | p_end = tok->cur; |
1276 | 4.69k | return MAKE_TOKEN(_PyToken_OneChar(c)); |
1277 | 4.69k | } |
1278 | 54.4k | } |
1279 | | |
1280 | | /* Check for two-character token */ |
1281 | 693k | { |
1282 | 693k | int c2 = tok_nextc(tok); |
1283 | 693k | int current_token = _PyToken_TwoChars(c, c2); |
1284 | 693k | if (current_token != OP) { |
1285 | 23.5k | int c3 = tok_nextc(tok); |
1286 | 23.5k | int current_token3 = _PyToken_ThreeChars(c, c2, c3); |
1287 | 23.5k | if (current_token3 != OP) { |
1288 | 938 | current_token = current_token3; |
1289 | 938 | } |
1290 | 22.5k | else { |
1291 | 22.5k | tok_backup(tok, c3); |
1292 | 22.5k | } |
1293 | 23.5k | p_start = tok->start; |
1294 | 23.5k | p_end = tok->cur; |
1295 | 23.5k | return MAKE_TOKEN(current_token); |
1296 | 23.5k | } |
1297 | 669k | tok_backup(tok, c2); |
1298 | 669k | } |
1299 | | |
1300 | | /* Keep track of parentheses nesting level */ |
1301 | 0 | switch (c) { |
1302 | 86.3k | case '(': |
1303 | 115k | case '[': |
1304 | 158k | case '{': |
1305 | 158k | if (tok->level >= MAXLEVEL) { |
1306 | 4 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested parentheses")); |
1307 | 4 | } |
1308 | 158k | tok->parenstack[tok->level] = c; |
1309 | 158k | tok->parenlinenostack[tok->level] = tok->lineno; |
1310 | 158k | tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start); |
1311 | 158k | tok->level++; |
1312 | 158k | if (INSIDE_FSTRING(tok)) { |
1313 | 28.7k | current_tok->curly_bracket_depth++; |
1314 | 28.7k | } |
1315 | 158k | break; |
1316 | 59.5k | case ')': |
1317 | 70.6k | case ']': |
1318 | 96.0k | case '}': |
1319 | 96.0k | if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') { |
1320 | 58 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1321 | 58 | "%c-string: single '}' is not allowed", TOK_GET_STRING_PREFIX(tok))); |
1322 | 58 | } |
1323 | 96.0k | if (!tok->tok_extra_tokens && !tok->level) { |
1324 | 194 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "unmatched '%c'", c)); |
1325 | 194 | } |
1326 | 95.8k | if (tok->level > 0) { |
1327 | 95.8k | tok->level--; |
1328 | 95.8k | int opening = tok->parenstack[tok->level]; |
1329 | 95.8k | if (!tok->tok_extra_tokens && !((opening == '(' && c == ')') || |
1330 | 36.3k | (opening == '[' && c == ']') || |
1331 | 25.3k | (opening == '{' && c == '}'))) { |
1332 | | /* If the opening bracket belongs to an f-string's expression |
1333 | | part (e.g. f"{)}") and the closing bracket is an arbitrary |
1334 | | nested expression, then instead of matching a different |
1335 | | syntactical construct with it, we'll throw an unmatched
1336 | | parentheses error. */ |
1337 | 39 | if (INSIDE_FSTRING(tok) && opening == '{') { |
1338 | 7 | assert(current_tok->curly_bracket_depth >= 0); |
1339 | 7 | int previous_bracket = current_tok->curly_bracket_depth - 1; |
1340 | 7 | if (previous_bracket == current_tok->curly_bracket_expr_start_depth) { |
1341 | 5 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1342 | 5 | "%c-string: unmatched '%c'", TOK_GET_STRING_PREFIX(tok), c)); |
1343 | 5 | } |
1344 | 7 | } |
1345 | 34 | if (tok->parenlinenostack[tok->level] != tok->lineno) { |
1346 | 2 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1347 | 2 | "closing parenthesis '%c' does not match " |
1348 | 2 | "opening parenthesis '%c' on line %d", |
1349 | 2 | c, opening, tok->parenlinenostack[tok->level])); |
1350 | 2 | } |
1351 | 32 | else { |
1352 | 32 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1353 | 32 | "closing parenthesis '%c' does not match " |
1354 | 32 | "opening parenthesis '%c'", |
1355 | 32 | c, opening)); |
1356 | 32 | } |
1357 | 34 | } |
1358 | 95.8k | } |
1359 | | |
1360 | 95.8k | if (INSIDE_FSTRING(tok)) { |
1361 | 21.6k | current_tok->curly_bracket_depth--; |
1362 | 21.6k | if (current_tok->curly_bracket_depth < 0) { |
1363 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "%c-string: unmatched '%c'", |
1364 | 1 | TOK_GET_STRING_PREFIX(tok), c)); |
1365 | 1 | } |
1366 | 21.6k | if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) { |
1367 | 20.3k | current_tok->curly_bracket_expr_start_depth--; |
1368 | 20.3k | current_tok->kind = TOK_FSTRING_MODE; |
1369 | 20.3k | current_tok->in_format_spec = 0; |
1370 | 20.3k | current_tok->in_debug = 0; |
1371 | 20.3k | } |
1372 | 21.6k | } |
1373 | 95.8k | break; |
1374 | 415k | default: |
1375 | 415k | break; |
1376 | 669k | } |
1377 | | |
1378 | 669k | if (!Py_UNICODE_ISPRINTABLE(c)) { |
1379 | 427 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", c)); |
1380 | 427 | } |
1381 | | |
1382 | 669k | if( c == '=' && INSIDE_FSTRING_EXPR_AT_TOP(current_tok)) { |
1383 | 4.88k | current_tok->in_debug = 1; |
1384 | 4.88k | } |
1385 | | |
1386 | | /* Punctuation character */ |
1387 | 669k | p_start = tok->start; |
1388 | 669k | p_end = tok->cur; |
1389 | 669k | return MAKE_TOKEN(_PyToken_OneChar(c)); |
1390 | 669k | } |
1391 | | |
1392 | | static int |
1393 | | tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) |
1394 | 51.7k | { |
1395 | 51.7k | const char *p_start = NULL; |
1396 | 51.7k | const char *p_end = NULL; |
1397 | 51.7k | int end_quote_size = 0; |
1398 | 51.7k | int unicode_escape = 0; |
1399 | | |
1400 | 51.7k | tok->start = tok->cur; |
1401 | 51.7k | tok->first_lineno = tok->lineno; |
1402 | 51.7k | tok->starting_col_offset = tok->col_offset; |
1403 | | |
1404 | | // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize |
1405 | | // before it. |
1406 | 51.7k | int start_char = tok_nextc(tok); |
1407 | 51.7k | if (start_char == '{') { |
1408 | 14.9k | int peek1 = tok_nextc(tok); |
1409 | 14.9k | tok_backup(tok, peek1); |
1410 | 14.9k | tok_backup(tok, start_char); |
1411 | 14.9k | if (peek1 != '{') { |
1412 | 12.1k | current_tok->curly_bracket_expr_start_depth++; |
1413 | 12.1k | if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { |
1414 | 3 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1415 | 3 | "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok))); |
1416 | 3 | } |
1417 | 12.1k | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1418 | 12.1k | return tok_get_normal_mode(tok, current_tok, token); |
1419 | 12.1k | } |
1420 | 14.9k | } |
1421 | 36.8k | else { |
1422 | 36.8k | tok_backup(tok, start_char); |
1423 | 36.8k | } |
1424 | | |
1425 | | // Check if we are at the end of the string |
1426 | 56.4k | for (int i = 0; i < current_tok->quote_size; i++) { |
1427 | 44.5k | int quote = tok_nextc(tok); |
1428 | 44.5k | if (quote != current_tok->quote) { |
1429 | 27.7k | tok_backup(tok, quote); |
1430 | 27.7k | goto f_string_middle; |
1431 | 27.7k | } |
1432 | 44.5k | } |
1433 | | |
1434 | 11.8k | if (current_tok->last_expr_buffer != NULL) { |
1435 | 6.85k | PyMem_Free(current_tok->last_expr_buffer); |
1436 | 6.85k | current_tok->last_expr_buffer = NULL; |
1437 | 6.85k | current_tok->last_expr_size = 0; |
1438 | 6.85k | current_tok->last_expr_end = -1; |
1439 | 6.85k | } |
1440 | | |
1441 | 11.8k | p_start = tok->start; |
1442 | 11.8k | p_end = tok->cur; |
1443 | 11.8k | tok->tok_mode_stack_index--; |
1444 | 11.8k | return MAKE_TOKEN(FTSTRING_END(current_tok)); |
1445 | | |
1446 | 27.7k | f_string_middle: |
1447 | | |
1448 | | // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle |
1449 | | // this. |
1450 | 27.7k | tok->multi_line_start = tok->line_start; |
1451 | 170k | while (end_quote_size != current_tok->quote_size) { |
1452 | 165k | int c = tok_nextc(tok); |
1453 | 165k | if (tok->done == E_ERROR || tok->done == E_DECODE) { |
1454 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1455 | 0 | } |
1456 | 165k | int in_format_spec = ( |
1457 | 165k | current_tok->in_format_spec |
1458 | 10.9k | && |
1459 | 10.9k | INSIDE_FSTRING_EXPR(current_tok) |
1460 | 165k | ); |
1461 | | |
1462 | 165k | if (c == EOF || (current_tok->quote_size == 1 && c == '\n')) { |
1463 | 439 | if (tok->decoding_erred) { |
1464 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1465 | 0 | } |
1466 | | |
1467 | | // If we are in a format spec and we found a newline, |
1468 | | // it means that the format spec ends here and we should |
1469 | | // return to the regular mode. |
1470 | 439 | if (in_format_spec && c == '\n') { |
1471 | 49 | if (current_tok->quote_size == 1) { |
1472 | 49 | return MAKE_TOKEN( |
1473 | 49 | _PyTokenizer_syntaxerror( |
1474 | 49 | tok, |
1475 | 49 | "%c-string: newlines are not allowed in format specifiers for single quoted %c-strings", |
1476 | 49 | TOK_GET_STRING_PREFIX(tok), TOK_GET_STRING_PREFIX(tok) |
1477 | 49 | ) |
1478 | 49 | ); |
1479 | 49 | } |
1480 | 0 | tok_backup(tok, c); |
1481 | 0 | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1482 | 0 | current_tok->in_format_spec = 0; |
1483 | 0 | p_start = tok->start; |
1484 | 0 | p_end = tok->cur; |
1485 | 0 | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1486 | 49 | } |
1487 | | |
1488 | 439 | assert(tok->multi_line_start != NULL); |
1489 | | // shift the tok_state's location into |
1490 | | // the start of string, and report the error |
1491 | | // from the initial quote character |
1492 | 390 | tok->cur = (char *)current_tok->start; |
1493 | 390 | tok->cur++; |
1494 | 390 | tok->line_start = current_tok->multi_line_start; |
1495 | 390 | int start = tok->lineno; |
1496 | | |
1497 | 390 | tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); |
1498 | 390 | tok->lineno = the_current_tok->first_line; |
1499 | | |
1500 | 390 | if (current_tok->quote_size == 3) { |
1501 | 40 | _PyTokenizer_syntaxerror(tok, |
1502 | 40 | "unterminated triple-quoted %c-string literal" |
1503 | 40 | " (detected at line %d)", |
1504 | 40 | TOK_GET_STRING_PREFIX(tok), start); |
1505 | 40 | if (c != '\n') { |
1506 | 40 | tok->done = E_EOFS; |
1507 | 40 | } |
1508 | 40 | return MAKE_TOKEN(ERRORTOKEN); |
1509 | 40 | } |
1510 | 350 | else { |
1511 | 350 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1512 | 350 | "unterminated %c-string literal (detected at" |
1513 | 350 | " line %d)", TOK_GET_STRING_PREFIX(tok), start)); |
1514 | 350 | } |
1515 | 390 | } |
1516 | | |
1517 | 164k | if (c == current_tok->quote) { |
1518 | 9.02k | end_quote_size += 1; |
1519 | 9.02k | continue; |
1520 | 155k | } else { |
1521 | 155k | end_quote_size = 0; |
1522 | 155k | } |
1523 | | |
1524 | 155k | if (c == '{') { |
1525 | 16.3k | if (!_PyLexer_update_ftstring_expr(tok, c)) { |
1526 | 0 | return MAKE_TOKEN(ENDMARKER); |
1527 | 0 | } |
1528 | 16.3k | int peek = tok_nextc(tok); |
1529 | 16.3k | if (peek != '{' || in_format_spec) { |
1530 | 13.1k | tok_backup(tok, peek); |
1531 | 13.1k | tok_backup(tok, c); |
1532 | 13.1k | current_tok->curly_bracket_expr_start_depth++; |
1533 | 13.1k | if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { |
1534 | 5 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1535 | 5 | "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok))); |
1536 | 5 | } |
1537 | 13.0k | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1538 | 13.0k | current_tok->in_format_spec = 0; |
1539 | 13.0k | p_start = tok->start; |
1540 | 13.0k | p_end = tok->cur; |
1541 | 13.0k | } else { |
1542 | 3.28k | p_start = tok->start; |
1543 | 3.28k | p_end = tok->cur - 1; |
1544 | 3.28k | } |
1545 | 16.3k | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1546 | 139k | } else if (c == '}') { |
1547 | 5.45k | if (unicode_escape) { |
1548 | 381 | p_start = tok->start; |
1549 | 381 | p_end = tok->cur; |
1550 | 381 | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1551 | 381 | } |
1552 | 5.07k | int peek = tok_nextc(tok); |
1553 | | |
1554 | | // The tokenizer can only be in the format spec if we have already completed the expression |
1555 | | // scanning (indicated by the end of the expression being set) and we are not at the top level |
1556 | | // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double |
1557 | | // brackets, we can bypass it here. |
1558 | 5.07k | int cursor = current_tok->curly_bracket_depth; |
1559 | 5.07k | if (peek == '}' && !in_format_spec && cursor == 0) { |
1560 | 1.64k | p_start = tok->start; |
1561 | 1.64k | p_end = tok->cur - 1; |
1562 | 3.43k | } else { |
1563 | 3.43k | tok_backup(tok, peek); |
1564 | 3.43k | tok_backup(tok, c); |
1565 | 3.43k | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1566 | 3.43k | current_tok->in_format_spec = 0; |
1567 | 3.43k | p_start = tok->start; |
1568 | 3.43k | p_end = tok->cur; |
1569 | 3.43k | } |
1570 | 5.07k | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1571 | 133k | } else if (c == '\\') { |
1572 | 5.60k | int peek = tok_nextc(tok); |
1573 | 5.60k | if (peek == '\r') { |
1574 | 66 | peek = tok_nextc(tok); |
1575 | 66 | } |
1576 | | // Special case when the backslash is right before a curly |
1577 | | // brace. We have to back it up and return control
1578 | | // to the loop for the next iteration.
1579 | 5.60k | if (peek == '{' || peek == '}') { |
1580 | 1.40k | if (!current_tok->raw) { |
1581 | 1.20k | if (_PyTokenizer_warn_invalid_escape_sequence(tok, peek)) { |
1582 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
1583 | 1 | } |
1584 | 1.20k | } |
1585 | 1.40k | tok_backup(tok, peek); |
1586 | 1.40k | continue; |
1587 | 1.40k | } |
1588 | | |
1589 | 4.19k | if (!current_tok->raw) { |
1590 | 4.07k | if (peek == 'N') { |
1591 | | /* Handle named unicode escapes (\N{BULLET}) */ |
1592 | 483 | peek = tok_nextc(tok); |
1593 | 483 | if (peek == '{') { |
1594 | 406 | unicode_escape = 1; |
1595 | 406 | } else { |
1596 | 77 | tok_backup(tok, peek); |
1597 | 77 | } |
1598 | 483 | } |
1599 | 4.07k | } /* else { |
1600 | | skip the escaped character |
1601 | | }*/ |
1602 | 4.19k | } |
1603 | 155k | } |
1604 | | |
1605 | | // Backup the f-string quotes to emit a final FSTRING_MIDDLE and |
1606 | | // add the quotes to the FSTRING_END in the next tokenizer iteration. |
1607 | 11.8k | for (int i = 0; i < current_tok->quote_size; i++) { |
1608 | 6.37k | tok_backup(tok, current_tok->quote); |
1609 | 6.37k | } |
1610 | 5.50k | p_start = tok->start; |
1611 | 5.50k | p_end = tok->cur; |
1612 | 5.50k | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1613 | 27.7k | } |
1614 | | |
1615 | | static int |
1616 | | tok_get(struct tok_state *tok, struct token *token) |
1617 | 1.60M | { |
1618 | 1.60M | tokenizer_mode *current_tok = TOK_GET_MODE(tok); |
1619 | 1.60M | if (current_tok->kind == TOK_REGULAR_MODE) { |
1620 | 1.55M | return tok_get_normal_mode(tok, current_tok, token); |
1621 | 1.55M | } else { |
1622 | 51.7k | return tok_get_fstring_mode(tok, current_tok, token); |
1623 | 51.7k | } |
1624 | 1.60M | } |
1625 | | |
1626 | | int |
1627 | | _PyTokenizer_Get(struct tok_state *tok, struct token *token) |
1628 | 1.60M | { |
1629 | 1.60M | int result = tok_get(tok, token); |
1630 | 1.60M | if (tok->decoding_erred) { |
1631 | 0 | result = ERRORTOKEN; |
1632 | 0 | tok->done = E_DECODE; |
1633 | 0 | } |
1634 | 1.60M | return result; |
1635 | 1.60M | } |
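
Everything above is pulled one token at a time through _PyTokenizer_Get. A hedged sketch of a driver loop (CPython-internal, not a stable API; the constructor name _PyTokenizer_FromString and the _PyParser_TokenNames table are assumptions based on the surrounding parser sources and may differ between versions):

    #include "Python.h"
    #include "pycore_token.h"   /* token type constants, _PyParser_TokenNames */
    #include "state.h"          /* struct tok_state, struct token */

    static void dump_tokens(const char *source) {
        /* Assumed constructor; flags (exec_input, preserve_crlf) may vary. */
        struct tok_state *tok = _PyTokenizer_FromString(source, 1, 0);
        if (tok == NULL) {
            return;
        }
        struct token t = {0};
        int type;
        do {
            type = _PyTokenizer_Get(tok, &t);
            int len = (t.start && t.end) ? (int)(t.end - t.start) : 0;
            printf("%-15s %.*s\n", _PyParser_TokenNames[type], len,
                   t.start ? t.start : "");
            Py_CLEAR(t.metadata);  /* f/t-string tokens may carry metadata */
        } while (type != ENDMARKER && type != ERRORTOKEN);
        _PyTokenizer_Free(tok);
    }
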