/src/cpython/Parser/lexer/lexer.c
Line | Count | Source |
1 | | #include "Python.h" |
2 | | #include "pycore_token.h" |
3 | | #include "pycore_unicodeobject.h" |
4 | | #include "errcode.h" |
5 | | |
6 | | #include "state.h" |
7 | | #include "../tokenizer/helpers.h" |
8 | | |
/* Alternate tab spacing */
#define ALTTABSIZE 1

/* True for a character that may start an identifier: ASCII letter,
   underscore, or any byte >= 128 (non-ASCII bytes are validated later
   by verify_identifier()). */
#define is_potential_identifier_start(c) (\
              (c >= 'a' && c <= 'z')\
               || (c >= 'A' && c <= 'Z')\
               || c == '_'\
               || (c >= 128))

/* Same as above, but additionally allows ASCII digits (valid only in
   the tail of an identifier, not at its start). */
#define is_potential_identifier_char(c) (\
              (c >= 'a' && c <= 'z')\
               || (c >= 'A' && c <= 'Z')\
               || (c >= '0' && c <= '9')\
               || c == '_'\
               || (c >= 128))

#ifdef Py_DEBUG
/* Return the active f/t-string tokenizer mode (top of the mode stack). */
static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) {
    assert(tok->tok_mode_stack_index >= 0);
    assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL);
    return &(tok->tok_mode_stack[tok->tok_mode_stack_index]);
}
/* Push a new tokenizer mode (entering a nested f/t-string) and return it. */
static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) {
    assert(tok->tok_mode_stack_index >= 0);
    assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL);
    return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]);
}
#else
#define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index]))
#define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index]))
#endif

/* Pick the token type matching the current string kind (t-string vs f-string). */
#define FTSTRING_MIDDLE(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_MIDDLE : FSTRING_MIDDLE)
#define FTSTRING_END(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_END : FSTRING_END)
#define TOK_GET_STRING_PREFIX(tok) (TOK_GET_MODE(tok)->string_kind == TSTRING ? 't' : 'f')
/* Fill in `token` from the [p_start, p_end) range captured by the caller. */
#define MAKE_TOKEN(token_type) _PyLexer_token_setup(tok, token, token_type, p_start, p_end)
#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
                _PyLexer_type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))

/* Spaces in this constant are treated as "zero or more spaces or tabs" when
   tokenizing. */
static const char* type_comment_prefix = "# type: ";
51 | | |
/* Report whether any of the first `size` bytes of `str` is a NUL byte. */
static inline int
contains_null_bytes(const char* str, size_t size)
{
    const void *first_nul = memchr(str, 0, size);
    return first_nul != NULL;
}
57 | | |
/* Get next char, updating state; error code goes into tok->done */
static int
tok_nextc(struct tok_state *tok)
{
    int rc;
    for (;;) {
        if (tok->cur != tok->inp) {
            /* Guard the per-line column counter against overflow on
               pathologically long lines. */
            if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
                tok->done = E_COLUMNOVERFLOW;
                return EOF;
            }
            tok->col_offset++;
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        /* Buffer exhausted; if a previous error/EOF was recorded, stay at EOF. */
        if (tok->done != E_OK) {
            return EOF;
        }
        /* Ask the input source (file, string, readline, ...) for more data. */
        rc = tok->underflow(tok);
#if defined(Py_DEBUG)
        if (tok->debug) {
            fprintf(stderr, "line[%d] = ", tok->lineno);
            _PyTokenizer_print_escape(stderr, tok->cur, tok->inp - tok->cur);
            fprintf(stderr, " tok->done = %d\n", tok->done);
        }
#endif
        if (!rc) {
            /* Underflow failed (EOF or error); park cur at inp so the
               fast-path test keeps failing. */
            tok->cur = tok->inp;
            return EOF;
        }
        tok->line_start = tok->cur;

        /* Reject NUL bytes in the newly read data up front. */
        if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) {
            _PyTokenizer_syntaxerror(tok, "source code cannot contain null bytes");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    Py_UNREACHABLE();
}
97 | | |
98 | | /* Back-up one character */ |
99 | | static void |
100 | | tok_backup(struct tok_state *tok, int c) |
101 | 2.57M | { |
102 | 2.57M | if (c != EOF) { |
103 | 2.55M | if (--tok->cur < tok->buf) { |
104 | 0 | Py_FatalError("tokenizer beginning of buffer"); |
105 | 0 | } |
106 | 2.55M | if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) { |
107 | 0 | Py_FatalError("tok_backup: wrong character"); |
108 | 0 | } |
109 | 2.55M | tok->col_offset--; |
110 | 2.55M | } |
111 | 2.57M | } |
112 | | |
/* Attach the source text of the just-finished f/t-string expression to
   `token->metadata` (as a str object), stripping any '#' comments that
   appear outside of nested string literals.  `c` is the character that
   terminated the expression: '}', ':' or '!'.
   Returns 0 on success (or when no metadata is needed), -1 on
   allocation/decoding failure. */
static int
set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
    assert(token != NULL);
    assert(c == '}' || c == ':' || c == '!');
    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);

    /* Metadata is only needed for f-string debug expressions (f'{x=}')
       and for t-strings; skip if it was already produced. */
    if (!(tok_mode->in_debug || tok_mode->string_kind == TSTRING) || token->metadata) {
        return 0;
    }
    PyObject *res = NULL;

    // Look for a # character outside of string literals
    int hash_detected = 0;
    int in_string = 0;
    char quote_char = 0;

    for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) {
        char ch = tok_mode->last_expr_buffer[i];

        // Skip escaped characters
        if (ch == '\\') {
            i++;
            continue;
        }

        // Handle quotes
        if (ch == '"' || ch == '\'') {
            // The following if/else block works because there is an odd
            // number of quotes in STRING tokens and the lexer only ever
            // reaches this function with valid STRING tokens.
            // For example: """hello"""
            // First quote: in_string = 1
            // Second quote: in_string = 0
            // Third quote: in_string = 1
            if (!in_string) {
                in_string = 1;
                quote_char = ch;
            }
            else if (ch == quote_char) {
                in_string = 0;
            }
            continue;
        }

        // Check for # outside strings
        if (ch == '#' && !in_string) {
            hash_detected = 1;
            break;
        }
    }
    // If we found a # character in the expression, we need to handle comments
    if (hash_detected) {
        // Allocate buffer for processed result; comment stripping only
        // removes characters, so the input length (+1 for NUL) suffices.
        char *result = (char *)PyMem_Malloc((tok_mode->last_expr_size - tok_mode->last_expr_end + 1) * sizeof(char));
        if (!result) {
            return -1;
        }

        Py_ssize_t i = 0;  // Input position
        Py_ssize_t j = 0;  // Output position
        in_string = 0;     // Whether we're in a string
        quote_char = 0;    // Current string quote char

        // Process each character
        while (i < tok_mode->last_expr_size - tok_mode->last_expr_end) {
            char ch = tok_mode->last_expr_buffer[i];

            // Handle string quotes
            if (ch == '"' || ch == '\'') {
                // See comment above to understand this part
                if (!in_string) {
                    in_string = 1;
                    quote_char = ch;
                } else if (ch == quote_char) {
                    in_string = 0;
                }
                result[j++] = ch;
            }
            // Skip comments: drop everything up to (but not including)
            // the terminating newline.
            else if (ch == '#' && !in_string) {
                while (i < tok_mode->last_expr_size - tok_mode->last_expr_end &&
                       tok_mode->last_expr_buffer[i] != '\n') {
                    i++;
                }
                if (i < tok_mode->last_expr_size - tok_mode->last_expr_end) {
                    result[j++] = '\n';
                }
            }
            // Copy other chars
            else {
                result[j++] = ch;
            }
            i++;
        }

        result[j] = '\0';  // Null-terminate the result string
        res = PyUnicode_DecodeUTF8(result, j, NULL);
        PyMem_Free(result);
    } else {
        // No comments: decode the expression text directly.
        res = PyUnicode_DecodeUTF8(
            tok_mode->last_expr_buffer,
            tok_mode->last_expr_size - tok_mode->last_expr_end,
            NULL
        );
    }

    if (!res) {
        return -1;
    }
    token->metadata = res;
    return 0;
}
225 | | |
226 | | int |
227 | | _PyLexer_update_ftstring_expr(struct tok_state *tok, char cur) |
228 | 69.2k | { |
229 | 69.2k | assert(tok->cur != NULL); |
230 | | |
231 | 69.2k | Py_ssize_t size = strlen(tok->cur); |
232 | 69.2k | tokenizer_mode *tok_mode = TOK_GET_MODE(tok); |
233 | | |
234 | 69.2k | switch (cur) { |
235 | 0 | case 0: |
236 | 0 | if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) { |
237 | 0 | return 1; |
238 | 0 | } |
239 | 0 | char *new_buffer = PyMem_Realloc( |
240 | 0 | tok_mode->last_expr_buffer, |
241 | 0 | tok_mode->last_expr_size + size |
242 | 0 | ); |
243 | 0 | if (new_buffer == NULL) { |
244 | 0 | PyMem_Free(tok_mode->last_expr_buffer); |
245 | 0 | goto error; |
246 | 0 | } |
247 | 0 | tok_mode->last_expr_buffer = new_buffer; |
248 | 0 | strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size); |
249 | 0 | tok_mode->last_expr_size += size; |
250 | 0 | break; |
251 | 43.4k | case '{': |
252 | 43.4k | if (tok_mode->last_expr_buffer != NULL) { |
253 | 32.2k | PyMem_Free(tok_mode->last_expr_buffer); |
254 | 32.2k | } |
255 | 43.4k | tok_mode->last_expr_buffer = PyMem_Malloc(size); |
256 | 43.4k | if (tok_mode->last_expr_buffer == NULL) { |
257 | 0 | goto error; |
258 | 0 | } |
259 | 43.4k | tok_mode->last_expr_size = size; |
260 | 43.4k | tok_mode->last_expr_end = -1; |
261 | 43.4k | strncpy(tok_mode->last_expr_buffer, tok->cur, size); |
262 | 43.4k | break; |
263 | 21.2k | case '}': |
264 | 22.8k | case '!': |
265 | 22.8k | tok_mode->last_expr_end = strlen(tok->start); |
266 | 22.8k | break; |
267 | 2.96k | case ':': |
268 | 2.96k | if (tok_mode->last_expr_end == -1) { |
269 | 2.55k | tok_mode->last_expr_end = strlen(tok->start); |
270 | 2.55k | } |
271 | 2.96k | break; |
272 | 0 | default: |
273 | 0 | Py_UNREACHABLE(); |
274 | 69.2k | } |
275 | 69.2k | return 1; |
276 | 0 | error: |
277 | 0 | tok->done = E_NOMEM; |
278 | 0 | return 0; |
279 | 69.2k | } |
280 | | |
/* Return 1 if the upcoming input spells exactly the keyword tail `test`
   followed by a character that cannot continue an identifier; 0 otherwise.
   Every character read is pushed back via tok_backup(), so the stream
   position is unchanged on return. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *s = test;
    int res = 0;
    while (1) {
        int c = tok_nextc(tok);
        if (*s == 0) {
            /* Whole tail matched; succeed only if the keyword is not
               merely a prefix of a longer identifier. */
            res = !is_potential_identifier_char(c);
        }
        else if (c == *s) {
            s++;
            continue;
        }

        /* Undo the reads: first the terminating character, then the
           matched tail in reverse order. */
        tok_backup(tok, c);
        while (s != test) {
            tok_backup(tok, *--s);
        }
        return res;
    }
}
303 | | |
/* Check the character `c` that follows a numeric literal of the given
   `kind` ("decimal", "hexadecimal", ...).  Emits a SyntaxWarning when the
   literal is directly followed by a keyword that can legally follow a
   number, a SyntaxError for any other identifier character.
   Returns 1 if tokenization may continue, 0 on error. */
static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind) {
    if (tok->tok_extra_tokens) {
        // When we are parsing extra tokens, we don't want to emit warnings
        // about invalid literals, because we want to be a bit more liberal.
        return 1;
    }
    /* Emit a deprecation warning only if the numeric literal is immediately
     * followed by one of keywords which can occur after a numeric literal
     * in valid code: "and", "else", "for", "if", "in", "is" and "or".
     * It allows to gradually deprecate existing valid code without adding
     * warning before error in most cases of invalid numeric literal (which
     * would be confusing and break existing tests).
     * Raise a syntax error with slightly better message than plain
     * "invalid syntax" if the numeric literal is immediately followed by
     * other keyword or identifier.
     */
    int r = 0;
    if (c == 'a') {
        r = lookahead(tok, "nd");
    }
    else if (c == 'e') {
        r = lookahead(tok, "lse");
    }
    else if (c == 'f') {
        r = lookahead(tok, "or");
    }
    else if (c == 'i') {
        /* "if", "in", "is": all two-letter keywords, so a single extra
           character decides the match. */
        int c2 = tok_nextc(tok);
        if (c2 == 'f' || c2 == 'n' || c2 == 's') {
            r = 1;
        }
        tok_backup(tok, c2);
    }
    else if (c == 'o') {
        r = lookahead(tok, "r");
    }
    else if (c == 'n') {
        r = lookahead(tok, "ot");
    }
    if (r) {
        /* Keyword follows directly: warn, then restore the stream position. */
        tok_backup(tok, c);
        if (_PyTokenizer_parser_warn(tok, PyExc_SyntaxWarning,
                "invalid %s literal", kind))
        {
            return 0;
        }
        tok_nextc(tok);
    }
    else /* In future releases, only error will remain. */
    if (c < 128 && is_potential_identifier_char(c)) {
        tok_backup(tok, c);
        _PyTokenizer_syntaxerror(tok, "invalid %s literal", kind);
        return 0;
    }
    return 1;
}
361 | | |
/* Verify that the identifier follows PEP 3131.
   The identifier text is the UTF-8 range [tok->start, tok->cur).
   Returns 1 if valid; 0 after setting tok->done / raising a syntax error,
   with tok->cur adjusted to point just past the offending character. */
static int
verify_identifier(struct tok_state *tok)
{
    if (tok->tok_extra_tokens) {
        return 1;
    }
    PyObject *s;
    if (tok->decoding_erred)
        return 0;
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    if (s == NULL) {
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
            tok->done = E_DECODE;
        }
        else {
            tok->done = E_ERROR;
        }
        return 0;
    }
    /* Index of the first code point that is not valid in an identifier,
       or the string length if all are valid. */
    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
    assert(invalid >= 0);
    assert(PyUnicode_GET_LENGTH(s) > 0);
    if (invalid < PyUnicode_GET_LENGTH(s)) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
            /* Determine the offset in UTF-8 encoded input */
            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
            if (s != NULL) {
                Py_SETREF(s, PyUnicode_AsUTF8String(s));
            }
            if (s == NULL) {
                tok->done = E_ERROR;
                return 0;
            }
            /* Point the error location just past the invalid character. */
            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
        }
        Py_DECREF(s);
        if (Py_UNICODE_ISPRINTABLE(ch)) {
            _PyTokenizer_syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch);
        }
        else {
            _PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", ch);
        }
        return 0;
    }
    Py_DECREF(s);
    return 1;
}
411 | | |
/* Consume the remaining digits (and '_' group separators) of a decimal
   literal.  Returns the first character past the literal, or 0 after
   reporting a syntax error for an underscore not followed by a digit. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int ch = tok_nextc(tok);
        while (Py_ISDIGIT(ch)) {
            ch = tok_nextc(tok);
        }
        if (ch != '_') {
            return ch;
        }
        /* A group separator must be followed by at least one digit. */
        ch = tok_nextc(tok);
        if (!Py_ISDIGIT(ch)) {
            tok_backup(tok, ch);
            _PyTokenizer_syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
433 | | |
434 | | static inline int |
435 | 656 | tok_continuation_line(struct tok_state *tok) { |
436 | 656 | int c = tok_nextc(tok); |
437 | 656 | if (c == '\r') { |
438 | 42 | c = tok_nextc(tok); |
439 | 42 | } |
440 | 656 | if (c != '\n') { |
441 | 38 | tok->done = E_LINECONT; |
442 | 38 | return -1; |
443 | 38 | } |
444 | 618 | c = tok_nextc(tok); |
445 | 618 | if (c == EOF) { |
446 | 42 | tok->done = E_EOF; |
447 | 42 | tok->cur = tok->inp; |
448 | 42 | return -1; |
449 | 576 | } else { |
450 | 576 | tok_backup(tok, c); |
451 | 576 | } |
452 | 576 | return c; |
453 | 618 | } |
454 | | |
/* Raise a SyntaxError for an unsupported combination of string prefix
   letters (each saw_* flag records that the letter was seen, in any case
   and order).  Returns 0 if the combination is legal, -1 after raising.
   Supported combinations: rb, rf, rt (in any order).
   Unsupported: ub, ur, uf, ut, bf, bt, ft (in any order). */
static int
maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,
                                             int saw_b, int saw_r, int saw_u,
                                             int saw_f, int saw_t) {
    // Supported: rb, rf, rt (in any order)
    // Unsupported: ub, ur, uf, ut, bf, bt, ft (in any order)

    /* The error range covers the whole prefix+quote start of the literal. */
#define RETURN_SYNTAX_ERROR(PREFIX1, PREFIX2)                             \
    do {                                                                  \
        (void)_PyTokenizer_syntaxerror_known_range(                       \
            tok, (int)(tok->start + 1 - tok->line_start),                 \
            (int)(tok->cur - tok->line_start),                            \
            "'" PREFIX1 "' and '" PREFIX2 "' prefixes are incompatible"); \
        return -1;                                                        \
    } while (0)

    /* 'u' combines with nothing else. */
    if (saw_u && saw_b) {
        RETURN_SYNTAX_ERROR("u", "b");
    }
    if (saw_u && saw_r) {
        RETURN_SYNTAX_ERROR("u", "r");
    }
    if (saw_u && saw_f) {
        RETURN_SYNTAX_ERROR("u", "f");
    }
    if (saw_u && saw_t) {
        RETURN_SYNTAX_ERROR("u", "t");
    }

    /* 'b' cannot combine with an interpolation prefix. */
    if (saw_b && saw_f) {
        RETURN_SYNTAX_ERROR("b", "f");
    }
    if (saw_b && saw_t) {
        RETURN_SYNTAX_ERROR("b", "t");
    }

    /* 'f' and 't' are mutually exclusive. */
    if (saw_f && saw_t) {
        RETURN_SYNTAX_ERROR("f", "t");
    }

#undef RETURN_SYNTAX_ERROR

    return 0;
}
499 | | |
500 | | static int |
501 | | tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) |
502 | 1.16M | { |
503 | 1.16M | int c; |
504 | 1.16M | int blankline, nonascii; |
505 | | |
506 | 1.16M | const char *p_start = NULL; |
507 | 1.16M | const char *p_end = NULL; |
508 | 1.23M | nextline: |
509 | 1.23M | tok->start = NULL; |
510 | 1.23M | tok->starting_col_offset = -1; |
511 | 1.23M | blankline = 0; |
512 | | |
513 | | |
514 | | /* Get indentation level */ |
515 | 1.23M | if (tok->atbol) { |
516 | 159k | int col = 0; |
517 | 159k | int altcol = 0; |
518 | 159k | tok->atbol = 0; |
519 | 159k | int cont_line_col = 0; |
520 | 526k | for (;;) { |
521 | 526k | c = tok_nextc(tok); |
522 | 526k | if (c == ' ') { |
523 | 365k | col++, altcol++; |
524 | 365k | } |
525 | 160k | else if (c == '\t') { |
526 | 598 | col = (col / tok->tabsize + 1) * tok->tabsize; |
527 | 598 | altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE; |
528 | 598 | } |
529 | 160k | else if (c == '\014') {/* Control-L (formfeed) */ |
530 | 553 | col = altcol = 0; /* For Emacs users */ |
531 | 553 | } |
532 | 159k | else if (c == '\\') { |
533 | | // Indentation cannot be split over multiple physical lines |
534 | | // using backslashes. This means that if we found a backslash |
535 | | // preceded by whitespace, **the first one we find** determines |
536 | | // the level of indentation of whatever comes next. |
537 | 478 | cont_line_col = cont_line_col ? cont_line_col : col; |
538 | 478 | if ((c = tok_continuation_line(tok)) == -1) { |
539 | 39 | return MAKE_TOKEN(ERRORTOKEN); |
540 | 39 | } |
541 | 478 | } |
542 | 159k | else if (c == EOF && PyErr_Occurred()) { |
543 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
544 | 0 | } |
545 | 159k | else { |
546 | 159k | break; |
547 | 159k | } |
548 | 526k | } |
549 | 159k | tok_backup(tok, c); |
550 | 159k | if (c == '#' || c == '\n' || c == '\r') { |
551 | | /* Lines with only whitespace and/or comments |
552 | | shouldn't affect the indentation and are |
553 | | not passed to the parser as NEWLINE tokens, |
554 | | except *totally* empty lines in interactive |
555 | | mode, which signal the end of a command group. */ |
556 | 37.1k | if (col == 0 && c == '\n' && tok->prompt != NULL) { |
557 | 0 | blankline = 0; /* Let it through */ |
558 | 0 | } |
559 | 37.1k | else if (tok->prompt != NULL && tok->lineno == 1) { |
560 | | /* In interactive mode, if the first line contains |
561 | | only spaces and/or a comment, let it through. */ |
562 | 0 | blankline = 0; |
563 | 0 | col = altcol = 0; |
564 | 0 | } |
565 | 37.1k | else { |
566 | 37.1k | blankline = 1; /* Ignore completely */ |
567 | 37.1k | } |
568 | | /* We can't jump back right here since we still |
569 | | may need to skip to the end of a comment */ |
570 | 37.1k | } |
571 | 159k | if (!blankline && tok->level == 0) { |
572 | 88.2k | col = cont_line_col ? cont_line_col : col; |
573 | 88.2k | altcol = cont_line_col ? cont_line_col : altcol; |
574 | 88.2k | if (col == tok->indstack[tok->indent]) { |
575 | | /* No change */ |
576 | 67.1k | if (altcol != tok->altindstack[tok->indent]) { |
577 | 1 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
578 | 1 | } |
579 | 67.1k | } |
580 | 21.0k | else if (col > tok->indstack[tok->indent]) { |
581 | | /* Indent -- always one */ |
582 | 11.9k | if (tok->indent+1 >= MAXINDENT) { |
583 | 0 | tok->done = E_TOODEEP; |
584 | 0 | tok->cur = tok->inp; |
585 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
586 | 0 | } |
587 | 11.9k | if (altcol <= tok->altindstack[tok->indent]) { |
588 | 2 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
589 | 2 | } |
590 | 11.9k | tok->pendin++; |
591 | 11.9k | tok->indstack[++tok->indent] = col; |
592 | 11.9k | tok->altindstack[tok->indent] = altcol; |
593 | 11.9k | } |
594 | 9.14k | else /* col < tok->indstack[tok->indent] */ { |
595 | | /* Dedent -- any number, must be consistent */ |
596 | 20.4k | while (tok->indent > 0 && |
597 | 17.6k | col < tok->indstack[tok->indent]) { |
598 | 11.2k | tok->pendin--; |
599 | 11.2k | tok->indent--; |
600 | 11.2k | } |
601 | 9.14k | if (col != tok->indstack[tok->indent]) { |
602 | 4 | tok->done = E_DEDENT; |
603 | 4 | tok->cur = tok->inp; |
604 | 4 | return MAKE_TOKEN(ERRORTOKEN); |
605 | 4 | } |
606 | 9.14k | if (altcol != tok->altindstack[tok->indent]) { |
607 | 1 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
608 | 1 | } |
609 | 9.14k | } |
610 | 88.2k | } |
611 | 159k | } |
612 | | |
613 | 1.23M | tok->start = tok->cur; |
614 | 1.23M | tok->starting_col_offset = tok->col_offset; |
615 | | |
616 | | /* Return pending indents/dedents */ |
617 | 1.23M | if (tok->pendin != 0) { |
618 | 23.1k | if (tok->pendin < 0) { |
619 | 11.2k | if (tok->tok_extra_tokens) { |
620 | 30 | p_start = tok->cur; |
621 | 30 | p_end = tok->cur; |
622 | 30 | } |
623 | 11.2k | tok->pendin++; |
624 | 11.2k | return MAKE_TOKEN(DEDENT); |
625 | 11.2k | } |
626 | 11.9k | else { |
627 | 11.9k | if (tok->tok_extra_tokens) { |
628 | 32 | p_start = tok->buf; |
629 | 32 | p_end = tok->cur; |
630 | 32 | } |
631 | 11.9k | tok->pendin--; |
632 | 11.9k | return MAKE_TOKEN(INDENT); |
633 | 11.9k | } |
634 | 23.1k | } |
635 | | |
636 | | /* Peek ahead at the next character */ |
637 | 1.20M | c = tok_nextc(tok); |
638 | 1.20M | tok_backup(tok, c); |
639 | | |
640 | 1.20M | again: |
641 | 1.20M | tok->start = NULL; |
642 | | /* Skip spaces */ |
643 | 1.46M | do { |
644 | 1.46M | c = tok_nextc(tok); |
645 | 1.46M | } while (c == ' ' || c == '\t' || c == '\014'); |
646 | | |
647 | | /* Set start of current token */ |
648 | 1.20M | tok->start = tok->cur == NULL ? NULL : tok->cur - 1; |
649 | 1.20M | tok->starting_col_offset = tok->col_offset - 1; |
650 | | |
651 | | /* Skip comment, unless it's a type comment */ |
652 | 1.20M | if (c == '#') { |
653 | | |
654 | 31.5k | const char* p = NULL; |
655 | 31.5k | const char *prefix, *type_start; |
656 | 31.5k | int current_starting_col_offset; |
657 | | |
658 | 979k | while (c != EOF && c != '\n' && c != '\r') { |
659 | 947k | c = tok_nextc(tok); |
660 | 947k | } |
661 | | |
662 | 31.5k | if (tok->tok_extra_tokens) { |
663 | 22 | p = tok->start; |
664 | 22 | } |
665 | | |
666 | 31.5k | if (tok->type_comments) { |
667 | 0 | p = tok->start; |
668 | 0 | current_starting_col_offset = tok->starting_col_offset; |
669 | 0 | prefix = type_comment_prefix; |
670 | 0 | while (*prefix && p < tok->cur) { |
671 | 0 | if (*prefix == ' ') { |
672 | 0 | while (*p == ' ' || *p == '\t') { |
673 | 0 | p++; |
674 | 0 | current_starting_col_offset++; |
675 | 0 | } |
676 | 0 | } else if (*prefix == *p) { |
677 | 0 | p++; |
678 | 0 | current_starting_col_offset++; |
679 | 0 | } else { |
680 | 0 | break; |
681 | 0 | } |
682 | | |
683 | 0 | prefix++; |
684 | 0 | } |
685 | | |
686 | | /* This is a type comment if we matched all of type_comment_prefix. */ |
687 | 0 | if (!*prefix) { |
688 | 0 | int is_type_ignore = 1; |
689 | | // +6 in order to skip the word 'ignore' |
690 | 0 | const char *ignore_end = p + 6; |
691 | 0 | const int ignore_end_col_offset = current_starting_col_offset + 6; |
692 | 0 | tok_backup(tok, c); /* don't eat the newline or EOF */ |
693 | |
|
694 | 0 | type_start = p; |
695 | | |
696 | | /* A TYPE_IGNORE is "type: ignore" followed by the end of the token |
697 | | * or anything ASCII and non-alphanumeric. */ |
698 | 0 | is_type_ignore = ( |
699 | 0 | tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 |
700 | 0 | && !(tok->cur > ignore_end |
701 | 0 | && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); |
702 | |
|
703 | 0 | if (is_type_ignore) { |
704 | 0 | p_start = ignore_end; |
705 | 0 | p_end = tok->cur; |
706 | | |
707 | | /* If this type ignore is the only thing on the line, consume the newline also. */ |
708 | 0 | if (blankline) { |
709 | 0 | tok_nextc(tok); |
710 | 0 | tok->atbol = 1; |
711 | 0 | } |
712 | 0 | return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); |
713 | 0 | } else { |
714 | 0 | p_start = type_start; |
715 | 0 | p_end = tok->cur; |
716 | 0 | return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset); |
717 | 0 | } |
718 | 0 | } |
719 | 0 | } |
720 | 31.5k | if (tok->tok_extra_tokens) { |
721 | 22 | tok_backup(tok, c); /* don't eat the newline or EOF */ |
722 | 22 | p_start = p; |
723 | 22 | p_end = tok->cur; |
724 | 22 | tok->comment_newline = blankline; |
725 | 22 | return MAKE_TOKEN(COMMENT); |
726 | 22 | } |
727 | 31.5k | } |
728 | | |
729 | 1.20M | if (tok->done == E_INTERACT_STOP) { |
730 | 0 | return MAKE_TOKEN(ENDMARKER); |
731 | 0 | } |
732 | | |
733 | | /* Check for EOF and errors now */ |
734 | 1.20M | if (c == EOF) { |
735 | 13.5k | if (tok->level) { |
736 | 3.39k | return MAKE_TOKEN(ERRORTOKEN); |
737 | 3.39k | } |
738 | 10.1k | return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN); |
739 | 13.5k | } |
740 | | |
741 | | /* Identifier (most frequent token!) */ |
742 | 1.19M | nonascii = 0; |
743 | 1.19M | if (is_potential_identifier_start(c)) { |
744 | | /* Process the various legal combinations of b"", r"", u"", and f"". */ |
745 | 394k | int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0; |
746 | 489k | while (1) { |
747 | 489k | if (!saw_b && (c == 'b' || c == 'B')) { |
748 | 16.0k | saw_b = 1; |
749 | 16.0k | } |
750 | | /* Since this is a backwards compatibility support literal we don't |
751 | | want to support it in arbitrary order like byte literals. */ |
752 | 473k | else if (!saw_u && (c == 'u'|| c == 'U')) { |
753 | 5.50k | saw_u = 1; |
754 | 5.50k | } |
755 | | /* ur"" and ru"" are not supported */ |
756 | 467k | else if (!saw_r && (c == 'r' || c == 'R')) { |
757 | 26.2k | saw_r = 1; |
758 | 26.2k | } |
759 | 441k | else if (!saw_f && (c == 'f' || c == 'F')) { |
760 | 37.8k | saw_f = 1; |
761 | 37.8k | } |
762 | 403k | else if (!saw_t && (c == 't' || c == 'T')) { |
763 | 28.4k | saw_t = 1; |
764 | 28.4k | } |
765 | 374k | else { |
766 | 374k | break; |
767 | 374k | } |
768 | 114k | c = tok_nextc(tok); |
769 | 114k | if (c == '"' || c == '\'') { |
770 | | // Raise error on incompatible string prefixes: |
771 | 19.4k | int status = maybe_raise_syntax_error_for_string_prefixes( |
772 | 19.4k | tok, saw_b, saw_r, saw_u, saw_f, saw_t); |
773 | 19.4k | if (status < 0) { |
774 | 8 | return MAKE_TOKEN(ERRORTOKEN); |
775 | 8 | } |
776 | | |
777 | | // Handle valid f or t string creation: |
778 | 19.4k | if (saw_f || saw_t) { |
779 | 16.1k | goto f_string_quote; |
780 | 16.1k | } |
781 | 3.23k | goto letter_quote; |
782 | 19.4k | } |
783 | 114k | } |
784 | 1.71M | while (is_potential_identifier_char(c)) { |
785 | 1.33M | if (c >= 128) { |
786 | 160k | nonascii = 1; |
787 | 160k | } |
788 | 1.33M | c = tok_nextc(tok); |
789 | 1.33M | } |
790 | 374k | tok_backup(tok, c); |
791 | 374k | if (nonascii && !verify_identifier(tok)) { |
792 | 598 | return MAKE_TOKEN(ERRORTOKEN); |
793 | 598 | } |
794 | | |
795 | 374k | p_start = tok->start; |
796 | 374k | p_end = tok->cur; |
797 | | |
798 | 374k | return MAKE_TOKEN(NAME); |
799 | 374k | } |
800 | | |
801 | 799k | if (c == '\r') { |
802 | 184 | c = tok_nextc(tok); |
803 | 184 | } |
804 | | |
805 | | /* Newline */ |
806 | 799k | if (c == '\n') { |
807 | 143k | tok->atbol = 1; |
808 | 143k | if (blankline || tok->level > 0) { |
809 | 70.8k | if (tok->tok_extra_tokens) { |
810 | 64 | if (tok->comment_newline) { |
811 | 12 | tok->comment_newline = 0; |
812 | 12 | } |
813 | 64 | p_start = tok->start; |
814 | 64 | p_end = tok->cur; |
815 | 64 | return MAKE_TOKEN(NL); |
816 | 64 | } |
817 | 70.7k | goto nextline; |
818 | 70.8k | } |
819 | 72.4k | if (tok->comment_newline && tok->tok_extra_tokens) { |
820 | 6 | tok->comment_newline = 0; |
821 | 6 | p_start = tok->start; |
822 | 6 | p_end = tok->cur; |
823 | 6 | return MAKE_TOKEN(NL); |
824 | 6 | } |
825 | 72.4k | p_start = tok->start; |
826 | 72.4k | p_end = tok->cur - 1; /* Leave '\n' out of the string */ |
827 | 72.4k | tok->cont_line = 0; |
828 | 72.4k | return MAKE_TOKEN(NEWLINE); |
829 | 72.4k | } |
830 | | |
831 | | /* Period or number starting with period? */ |
832 | 656k | if (c == '.') { |
833 | 25.9k | c = tok_nextc(tok); |
834 | 25.9k | if (Py_ISDIGIT(c)) { |
835 | 2.54k | goto fraction; |
836 | 23.3k | } else if (c == '.') { |
837 | 1.19k | c = tok_nextc(tok); |
838 | 1.19k | if (c == '.') { |
839 | 558 | p_start = tok->start; |
840 | 558 | p_end = tok->cur; |
841 | 558 | return MAKE_TOKEN(ELLIPSIS); |
842 | 558 | } |
843 | 633 | else { |
844 | 633 | tok_backup(tok, c); |
845 | 633 | } |
846 | 633 | tok_backup(tok, '.'); |
847 | 633 | } |
848 | 22.1k | else { |
849 | 22.1k | tok_backup(tok, c); |
850 | 22.1k | } |
851 | 22.8k | p_start = tok->start; |
852 | 22.8k | p_end = tok->cur; |
853 | 22.8k | return MAKE_TOKEN(DOT); |
854 | 25.9k | } |
855 | | |
856 | | /* Number */ |
857 | 630k | if (Py_ISDIGIT(c)) { |
858 | 70.5k | if (c == '0') { |
859 | | /* Hex, octal or binary -- maybe. */ |
860 | 28.3k | c = tok_nextc(tok); |
861 | 28.3k | if (c == 'x' || c == 'X') { |
862 | | /* Hex */ |
863 | 14.5k | c = tok_nextc(tok); |
864 | 14.6k | do { |
865 | 14.6k | if (c == '_') { |
866 | 73 | c = tok_nextc(tok); |
867 | 73 | } |
868 | 14.6k | if (!Py_ISXDIGIT(c)) { |
869 | 15 | tok_backup(tok, c); |
870 | 15 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid hexadecimal literal")); |
871 | 15 | } |
872 | 77.7k | do { |
873 | 77.7k | c = tok_nextc(tok); |
874 | 77.7k | } while (Py_ISXDIGIT(c)); |
875 | 14.6k | } while (c == '_'); |
876 | 14.5k | if (!verify_end_of_number(tok, c, "hexadecimal")) { |
877 | 2 | return MAKE_TOKEN(ERRORTOKEN); |
878 | 2 | } |
879 | 14.5k | } |
880 | 13.7k | else if (c == 'o' || c == 'O') { |
881 | | /* Octal */ |
882 | 450 | c = tok_nextc(tok); |
883 | 592 | do { |
884 | 592 | if (c == '_') { |
885 | 143 | c = tok_nextc(tok); |
886 | 143 | } |
887 | 592 | if (c < '0' || c >= '8') { |
888 | 19 | if (Py_ISDIGIT(c)) { |
889 | 2 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
890 | 2 | "invalid digit '%c' in octal literal", c)); |
891 | 2 | } |
892 | 17 | else { |
893 | 17 | tok_backup(tok, c); |
894 | 17 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid octal literal")); |
895 | 17 | } |
896 | 19 | } |
897 | 1.31k | do { |
898 | 1.31k | c = tok_nextc(tok); |
899 | 1.31k | } while ('0' <= c && c < '8'); |
900 | 573 | } while (c == '_'); |
901 | 431 | if (Py_ISDIGIT(c)) { |
902 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
903 | 1 | "invalid digit '%c' in octal literal", c)); |
904 | 1 | } |
905 | 430 | if (!verify_end_of_number(tok, c, "octal")) { |
906 | 3 | return MAKE_TOKEN(ERRORTOKEN); |
907 | 3 | } |
908 | 430 | } |
909 | 13.3k | else if (c == 'b' || c == 'B') { |
910 | | /* Binary */ |
911 | 351 | c = tok_nextc(tok); |
912 | 644 | do { |
913 | 644 | if (c == '_') { |
914 | 302 | c = tok_nextc(tok); |
915 | 302 | } |
916 | 644 | if (c != '0' && c != '1') { |
917 | 23 | if (Py_ISDIGIT(c)) { |
918 | 2 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c)); |
919 | 2 | } |
920 | 21 | else { |
921 | 21 | tok_backup(tok, c); |
922 | 21 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid binary literal")); |
923 | 21 | } |
924 | 23 | } |
925 | 3.09k | do { |
926 | 3.09k | c = tok_nextc(tok); |
927 | 3.09k | } while (c == '0' || c == '1'); |
928 | 621 | } while (c == '_'); |
929 | 328 | if (Py_ISDIGIT(c)) { |
930 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c)); |
931 | 1 | } |
932 | 327 | if (!verify_end_of_number(tok, c, "binary")) { |
933 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
934 | 1 | } |
935 | 327 | } |
936 | 12.9k | else { |
937 | 12.9k | int nonzero = 0; |
938 | | /* maybe old-style octal; c is first char of it */ |
939 | | /* in any case, allow '0' as a literal */ |
940 | 14.4k | while (1) { |
941 | 14.4k | if (c == '_') { |
942 | 105 | c = tok_nextc(tok); |
943 | 105 | if (!Py_ISDIGIT(c)) { |
944 | 3 | tok_backup(tok, c); |
945 | 3 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal")); |
946 | 3 | } |
947 | 105 | } |
948 | 14.4k | if (c != '0') { |
949 | 12.9k | break; |
950 | 12.9k | } |
951 | 1.51k | c = tok_nextc(tok); |
952 | 1.51k | } |
953 | 12.9k | char* zeros_end = tok->cur; |
954 | 12.9k | if (Py_ISDIGIT(c)) { |
955 | 395 | nonzero = 1; |
956 | 395 | c = tok_decimal_tail(tok); |
957 | 395 | if (c == 0) { |
958 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
959 | 1 | } |
960 | 395 | } |
961 | 12.9k | if (c == '.') { |
962 | 742 | c = tok_nextc(tok); |
963 | 742 | goto fraction; |
964 | 742 | } |
965 | 12.2k | else if (c == 'e' || c == 'E') { |
966 | 923 | goto exponent; |
967 | 923 | } |
968 | 11.3k | else if (c == 'j' || c == 'J') { |
969 | 960 | goto imaginary; |
970 | 960 | } |
971 | 10.3k | else if (nonzero && !tok->tok_extra_tokens) { |
972 | | /* Old-style octal: now disallowed. */ |
973 | 29 | tok_backup(tok, c); |
974 | 29 | return MAKE_TOKEN(_PyTokenizer_syntaxerror_known_range( |
975 | 29 | tok, (int)(tok->start + 1 - tok->line_start), |
976 | 29 | (int)(zeros_end - tok->line_start), |
977 | 29 | "leading zeros in decimal integer " |
978 | 29 | "literals are not permitted; " |
979 | 29 | "use an 0o prefix for octal integers")); |
980 | 29 | } |
981 | 10.3k | if (!verify_end_of_number(tok, c, "decimal")) { |
982 | 23 | return MAKE_TOKEN(ERRORTOKEN); |
983 | 23 | } |
984 | 10.3k | } |
985 | 28.3k | } |
986 | 42.1k | else { |
987 | | /* Decimal */ |
988 | 42.1k | c = tok_decimal_tail(tok); |
989 | 42.1k | if (c == 0) { |
990 | 8 | return MAKE_TOKEN(ERRORTOKEN); |
991 | 8 | } |
992 | 42.1k | { |
993 | | /* Accept floating-point numbers. */ |
994 | 42.1k | if (c == '.') { |
995 | 2.58k | c = tok_nextc(tok); |
996 | 5.87k | fraction: |
997 | | /* Fraction */ |
998 | 5.87k | if (Py_ISDIGIT(c)) { |
999 | 4.65k | c = tok_decimal_tail(tok); |
1000 | 4.65k | if (c == 0) { |
1001 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
1002 | 1 | } |
1003 | 4.65k | } |
1004 | 5.87k | } |
1005 | 45.4k | if (c == 'e' || c == 'E') { |
1006 | 7.46k | int e; |
1007 | 8.38k | exponent: |
1008 | 8.38k | e = c; |
1009 | | /* Exponent part */ |
1010 | 8.38k | c = tok_nextc(tok); |
1011 | 8.38k | if (c == '+' || c == '-') { |
1012 | 2.94k | c = tok_nextc(tok); |
1013 | 2.94k | if (!Py_ISDIGIT(c)) { |
1014 | 12 | tok_backup(tok, c); |
1015 | 12 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal")); |
1016 | 12 | } |
1017 | 5.44k | } else if (!Py_ISDIGIT(c)) { |
1018 | 480 | tok_backup(tok, c); |
1019 | 480 | if (!verify_end_of_number(tok, e, "decimal")) { |
1020 | 39 | return MAKE_TOKEN(ERRORTOKEN); |
1021 | 39 | } |
1022 | 441 | tok_backup(tok, e); |
1023 | 441 | p_start = tok->start; |
1024 | 441 | p_end = tok->cur; |
1025 | 441 | return MAKE_TOKEN(NUMBER); |
1026 | 480 | } |
1027 | 7.89k | c = tok_decimal_tail(tok); |
1028 | 7.89k | if (c == 0) { |
1029 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1030 | 0 | } |
1031 | 7.89k | } |
1032 | 45.8k | if (c == 'j' || c == 'J') { |
1033 | | /* Imaginary part */ |
1034 | 3.72k | imaginary: |
1035 | 3.72k | c = tok_nextc(tok); |
1036 | 3.72k | if (!verify_end_of_number(tok, c, "imaginary")) { |
1037 | 6 | return MAKE_TOKEN(ERRORTOKEN); |
1038 | 6 | } |
1039 | 3.72k | } |
1040 | 43.1k | else if (!verify_end_of_number(tok, c, "decimal")) { |
1041 | 109 | return MAKE_TOKEN(ERRORTOKEN); |
1042 | 109 | } |
1043 | 45.8k | } |
1044 | 45.8k | } |
1045 | 72.3k | tok_backup(tok, c); |
1046 | 72.3k | p_start = tok->start; |
1047 | 72.3k | p_end = tok->cur; |
1048 | 72.3k | return MAKE_TOKEN(NUMBER); |
1049 | 70.5k | } |
1050 | | |
1051 | 576k | f_string_quote: |
1052 | 576k | if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't') |
1053 | 16.1k | && (c == '\'' || c == '"'))) { |
1054 | | |
1055 | 16.1k | int quote = c; |
1056 | 16.1k | int quote_size = 1; /* 1 or 3 */ |
1057 | | |
1058 | | /* Nodes of type STRING, especially multi line strings |
1059 | | must be handled differently in order to get both |
1060 | | the starting line number and the column offset right. |
1061 | | (cf. issue 16806) */ |
1062 | 16.1k | tok->first_lineno = tok->lineno; |
1063 | 16.1k | tok->multi_line_start = tok->line_start; |
1064 | | |
1065 | | /* Find the quote size and start of string */ |
1066 | 16.1k | int after_quote = tok_nextc(tok); |
1067 | 16.1k | if (after_quote == quote) { |
1068 | 2.40k | int after_after_quote = tok_nextc(tok); |
1069 | 2.40k | if (after_after_quote == quote) { |
1070 | 503 | quote_size = 3; |
1071 | 503 | } |
1072 | 1.90k | else { |
1073 | | // TODO: Check this |
1074 | 1.90k | tok_backup(tok, after_after_quote); |
1075 | 1.90k | tok_backup(tok, after_quote); |
1076 | 1.90k | } |
1077 | 2.40k | } |
1078 | 16.1k | if (after_quote != quote) { |
1079 | 13.7k | tok_backup(tok, after_quote); |
1080 | 13.7k | } |
1081 | | |
1082 | | |
1083 | 16.1k | p_start = tok->start; |
1084 | 16.1k | p_end = tok->cur; |
1085 | 16.1k | if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) { |
1086 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested f-strings or t-strings")); |
1087 | 1 | } |
1088 | 16.1k | tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok); |
1089 | 16.1k | the_current_tok->kind = TOK_FSTRING_MODE; |
1090 | 16.1k | the_current_tok->quote = quote; |
1091 | 16.1k | the_current_tok->quote_size = quote_size; |
1092 | 16.1k | the_current_tok->start = tok->start; |
1093 | 16.1k | the_current_tok->multi_line_start = tok->line_start; |
1094 | 16.1k | the_current_tok->first_line = tok->lineno; |
1095 | 16.1k | the_current_tok->start_offset = -1; |
1096 | 16.1k | the_current_tok->multi_line_start_offset = -1; |
1097 | 16.1k | the_current_tok->last_expr_buffer = NULL; |
1098 | 16.1k | the_current_tok->last_expr_size = 0; |
1099 | 16.1k | the_current_tok->last_expr_end = -1; |
1100 | 16.1k | the_current_tok->in_format_spec = 0; |
1101 | 16.1k | the_current_tok->in_debug = 0; |
1102 | | |
1103 | 16.1k | enum string_kind_t string_kind = FSTRING; |
1104 | 16.1k | switch (*tok->start) { |
1105 | 764 | case 'T': |
1106 | 4.27k | case 't': |
1107 | 4.27k | the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r'; |
1108 | 4.27k | string_kind = TSTRING; |
1109 | 4.27k | break; |
1110 | 1.52k | case 'F': |
1111 | 11.5k | case 'f': |
1112 | 11.5k | the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r'; |
1113 | 11.5k | break; |
1114 | 211 | case 'R': |
1115 | 358 | case 'r': |
1116 | 358 | the_current_tok->raw = 1; |
1117 | 358 | if (Py_TOLOWER(*(tok->start + 1)) == 't') { |
1118 | 42 | string_kind = TSTRING; |
1119 | 42 | } |
1120 | 358 | break; |
1121 | 0 | default: |
1122 | 0 | Py_UNREACHABLE(); |
1123 | 16.1k | } |
1124 | | |
1125 | 16.1k | the_current_tok->string_kind = string_kind; |
1126 | 16.1k | the_current_tok->curly_bracket_depth = 0; |
1127 | 16.1k | the_current_tok->curly_bracket_expr_start_depth = -1; |
1128 | 16.1k | return string_kind == TSTRING ? MAKE_TOKEN(TSTRING_START) : MAKE_TOKEN(FSTRING_START); |
1129 | 16.1k | } |
1130 | | |
1131 | 563k | letter_quote: |
1132 | | /* String */ |
1133 | 563k | if (c == '\'' || c == '"') { |
1134 | 36.6k | int quote = c; |
1135 | 36.6k | int quote_size = 1; /* 1 or 3 */ |
1136 | 36.6k | int end_quote_size = 0; |
1137 | 36.6k | int has_escaped_quote = 0; |
1138 | | |
1139 | | /* Nodes of type STRING, especially multi line strings |
1140 | | must be handled differently in order to get both |
1141 | | the starting line number and the column offset right. |
1142 | | (cf. issue 16806) */ |
1143 | 36.6k | tok->first_lineno = tok->lineno; |
1144 | 36.6k | tok->multi_line_start = tok->line_start; |
1145 | | |
1146 | | /* Find the quote size and start of string */ |
1147 | 36.6k | c = tok_nextc(tok); |
1148 | 36.6k | if (c == quote) { |
1149 | 6.20k | c = tok_nextc(tok); |
1150 | 6.20k | if (c == quote) { |
1151 | 1.22k | quote_size = 3; |
1152 | 1.22k | } |
1153 | 4.98k | else { |
1154 | 4.98k | end_quote_size = 1; /* empty string found */ |
1155 | 4.98k | } |
1156 | 6.20k | } |
1157 | 36.6k | if (c != quote) { |
1158 | 35.3k | tok_backup(tok, c); |
1159 | 35.3k | } |
1160 | | |
1161 | | /* Get rest of string */ |
1162 | 541k | while (end_quote_size != quote_size) { |
1163 | 505k | c = tok_nextc(tok); |
1164 | 505k | if (tok->done == E_ERROR) { |
1165 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1166 | 0 | } |
1167 | 505k | if (tok->done == E_DECODE) { |
1168 | 0 | break; |
1169 | 0 | } |
1170 | 505k | if (c == EOF || (quote_size == 1 && c == '\n')) { |
1171 | 280 | assert(tok->multi_line_start != NULL); |
1172 | | // shift the tok_state's location into |
1173 | | // the start of string, and report the error |
1174 | | // from the initial quote character |
1175 | 280 | tok->cur = (char *)tok->start; |
1176 | 280 | tok->cur++; |
1177 | 280 | tok->line_start = tok->multi_line_start; |
1178 | 280 | int start = tok->lineno; |
1179 | 280 | tok->lineno = tok->first_lineno; |
1180 | | |
1181 | 280 | if (INSIDE_FSTRING(tok)) { |
1182 | | /* When we are in an f-string, before raising the |
1183 | | * unterminated string literal error, check whether |
1184 | | * does the initial quote matches with f-strings quotes |
1185 | | * and if it is, then this must be a missing '}' token |
1186 | | * so raise the proper error */ |
1187 | 32 | tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); |
1188 | 32 | if (the_current_tok->quote == quote && |
1189 | 25 | the_current_tok->quote_size == quote_size) { |
1190 | 18 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1191 | 18 | "%c-string: expecting '}'", TOK_GET_STRING_PREFIX(tok))); |
1192 | 18 | } |
1193 | 32 | } |
1194 | | |
1195 | 262 | if (quote_size == 3) { |
1196 | 29 | _PyTokenizer_syntaxerror(tok, "unterminated triple-quoted string literal" |
1197 | 29 | " (detected at line %d)", start); |
1198 | 29 | if (c != '\n') { |
1199 | 29 | tok->done = E_EOFS; |
1200 | 29 | } |
1201 | 29 | return MAKE_TOKEN(ERRORTOKEN); |
1202 | 29 | } |
1203 | 233 | else { |
1204 | 233 | if (has_escaped_quote) { |
1205 | 9 | _PyTokenizer_syntaxerror( |
1206 | 9 | tok, |
1207 | 9 | "unterminated string literal (detected at line %d); " |
1208 | 9 | "perhaps you escaped the end quote?", |
1209 | 9 | start |
1210 | 9 | ); |
1211 | 224 | } else { |
1212 | 224 | _PyTokenizer_syntaxerror( |
1213 | 224 | tok, "unterminated string literal (detected at line %d)", start |
1214 | 224 | ); |
1215 | 224 | } |
1216 | 233 | if (c != '\n') { |
1217 | 16 | tok->done = E_EOLS; |
1218 | 16 | } |
1219 | 233 | return MAKE_TOKEN(ERRORTOKEN); |
1220 | 233 | } |
1221 | 262 | } |
1222 | 504k | if (c == quote) { |
1223 | 34.9k | end_quote_size += 1; |
1224 | 34.9k | } |
1225 | 469k | else { |
1226 | 469k | end_quote_size = 0; |
1227 | 469k | if (c == '\\') { |
1228 | 25.2k | c = tok_nextc(tok); /* skip escaped char */ |
1229 | 25.2k | if (c == quote) { /* but record whether the escaped char was a quote */ |
1230 | 1.25k | has_escaped_quote = 1; |
1231 | 1.25k | } |
1232 | 25.2k | if (c == '\r') { |
1233 | 21 | c = tok_nextc(tok); |
1234 | 21 | } |
1235 | 25.2k | } |
1236 | 469k | } |
1237 | 504k | } |
1238 | | |
1239 | 36.3k | p_start = tok->start; |
1240 | 36.3k | p_end = tok->cur; |
1241 | 36.3k | return MAKE_TOKEN(STRING); |
1242 | 36.6k | } |
1243 | | |
1244 | | /* Line continuation */ |
1245 | 526k | if (c == '\\') { |
1246 | 178 | if ((c = tok_continuation_line(tok)) == -1) { |
1247 | 41 | return MAKE_TOKEN(ERRORTOKEN); |
1248 | 41 | } |
1249 | 137 | tok->cont_line = 1; |
1250 | 137 | goto again; /* Read next line */ |
1251 | 178 | } |
1252 | | |
1253 | | /* Punctuation character */ |
1254 | 526k | int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{'); |
1255 | 526k | if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) { |
1256 | | /* This code block gets executed before the curly_bracket_depth is incremented |
1257 | | * by the `{` case, so for ensuring that we are on the 0th level, we need |
1258 | | * to adjust it manually */ |
1259 | 58.2k | int cursor = current_tok->curly_bracket_depth - (c != '{'); |
1260 | 58.2k | int in_format_spec = current_tok->in_format_spec; |
1261 | 58.2k | int cursor_in_format_with_debug = |
1262 | 58.2k | cursor == 1 && (current_tok->in_debug || in_format_spec); |
1263 | 58.2k | int cursor_valid = cursor == 0 || cursor_in_format_with_debug; |
1264 | 58.2k | if ((cursor_valid) && !_PyLexer_update_ftstring_expr(tok, c)) { |
1265 | 0 | return MAKE_TOKEN(ENDMARKER); |
1266 | 0 | } |
1267 | 58.2k | if ((cursor_valid) && c != '{' && set_ftstring_expr(tok, token, c)) { |
1268 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1269 | 0 | } |
1270 | | |
1271 | 58.2k | if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) { |
1272 | 3.61k | current_tok->kind = TOK_FSTRING_MODE; |
1273 | 3.61k | current_tok->in_format_spec = 1; |
1274 | 3.61k | p_start = tok->start; |
1275 | 3.61k | p_end = tok->cur; |
1276 | 3.61k | return MAKE_TOKEN(_PyToken_OneChar(c)); |
1277 | 3.61k | } |
1278 | 58.2k | } |
1279 | | |
1280 | | /* Check for two-character token */ |
1281 | 522k | { |
1282 | 522k | int c2 = tok_nextc(tok); |
1283 | 522k | int current_token = _PyToken_TwoChars(c, c2); |
1284 | 522k | if (current_token != OP) { |
1285 | 21.1k | int c3 = tok_nextc(tok); |
1286 | 21.1k | int current_token3 = _PyToken_ThreeChars(c, c2, c3); |
1287 | 21.1k | if (current_token3 != OP) { |
1288 | 1.50k | current_token = current_token3; |
1289 | 1.50k | } |
1290 | 19.6k | else { |
1291 | 19.6k | tok_backup(tok, c3); |
1292 | 19.6k | } |
1293 | 21.1k | p_start = tok->start; |
1294 | 21.1k | p_end = tok->cur; |
1295 | 21.1k | return MAKE_TOKEN(current_token); |
1296 | 21.1k | } |
1297 | 501k | tok_backup(tok, c2); |
1298 | 501k | } |
1299 | | |
1300 | | /* Keep track of parentheses nesting level */ |
1301 | 0 | switch (c) { |
1302 | 60.8k | case '(': |
1303 | 77.9k | case '[': |
1304 | 119k | case '{': |
1305 | 119k | if (tok->level >= MAXLEVEL) { |
1306 | 18 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested parentheses")); |
1307 | 18 | } |
1308 | 119k | tok->parenstack[tok->level] = c; |
1309 | 119k | tok->parenlinenostack[tok->level] = tok->lineno; |
1310 | 119k | tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start); |
1311 | 119k | tok->level++; |
1312 | 119k | if (INSIDE_FSTRING(tok)) { |
1313 | 30.7k | current_tok->curly_bracket_depth++; |
1314 | 30.7k | } |
1315 | 119k | break; |
1316 | 36.1k | case ')': |
1317 | 42.1k | case ']': |
1318 | 69.3k | case '}': |
1319 | 69.3k | if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') { |
1320 | 53 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1321 | 53 | "%c-string: single '}' is not allowed", TOK_GET_STRING_PREFIX(tok))); |
1322 | 53 | } |
1323 | 69.2k | if (!tok->tok_extra_tokens && !tok->level) { |
1324 | 163 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "unmatched '%c'", c)); |
1325 | 163 | } |
1326 | 69.1k | if (tok->level > 0) { |
1327 | 69.1k | tok->level--; |
1328 | 69.1k | int opening = tok->parenstack[tok->level]; |
1329 | 69.1k | if (!tok->tok_extra_tokens && !((opening == '(' && c == ')') || |
1330 | 33.0k | (opening == '[' && c == ']') || |
1331 | 27.0k | (opening == '{' && c == '}'))) { |
1332 | | /* If the opening bracket belongs to an f-string's expression |
1333 | | part (e.g. f"{)}") and the closing bracket is an arbitrary |
1334 | | nested expression, then instead of matching a different |
1335 | | syntactical construct with it; we'll throw an unmatched |
1336 | | parentheses error. */ |
1337 | 35 | if (INSIDE_FSTRING(tok) && opening == '{') { |
1338 | 5 | assert(current_tok->curly_bracket_depth >= 0); |
1339 | 5 | int previous_bracket = current_tok->curly_bracket_depth - 1; |
1340 | 5 | if (previous_bracket == current_tok->curly_bracket_expr_start_depth) { |
1341 | 3 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1342 | 3 | "%c-string: unmatched '%c'", TOK_GET_STRING_PREFIX(tok), c)); |
1343 | 3 | } |
1344 | 5 | } |
1345 | 32 | if (tok->parenlinenostack[tok->level] != tok->lineno) { |
1346 | 6 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1347 | 6 | "closing parenthesis '%c' does not match " |
1348 | 6 | "opening parenthesis '%c' on line %d", |
1349 | 6 | c, opening, tok->parenlinenostack[tok->level])); |
1350 | 6 | } |
1351 | 26 | else { |
1352 | 26 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1353 | 26 | "closing parenthesis '%c' does not match " |
1354 | 26 | "opening parenthesis '%c'", |
1355 | 26 | c, opening)); |
1356 | 26 | } |
1357 | 32 | } |
1358 | 69.1k | } |
1359 | | |
1360 | 69.1k | if (INSIDE_FSTRING(tok)) { |
1361 | 24.0k | current_tok->curly_bracket_depth--; |
1362 | 24.0k | if (current_tok->curly_bracket_depth < 0) { |
1363 | 0 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "%c-string: unmatched '%c'", |
1364 | 0 | TOK_GET_STRING_PREFIX(tok), c)); |
1365 | 0 | } |
1366 | 24.0k | if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) { |
1367 | 22.7k | current_tok->curly_bracket_expr_start_depth--; |
1368 | 22.7k | current_tok->kind = TOK_FSTRING_MODE; |
1369 | 22.7k | current_tok->in_format_spec = 0; |
1370 | 22.7k | current_tok->in_debug = 0; |
1371 | 22.7k | } |
1372 | 24.0k | } |
1373 | 69.1k | break; |
1374 | 313k | default: |
1375 | 313k | break; |
1376 | 501k | } |
1377 | | |
1378 | 501k | if (!Py_UNICODE_ISPRINTABLE(c)) { |
1379 | 361 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", c)); |
1380 | 361 | } |
1381 | | |
1382 | 501k | if( c == '=' && INSIDE_FSTRING_EXPR_AT_TOP(current_tok)) { |
1383 | 5.12k | current_tok->in_debug = 1; |
1384 | 5.12k | } |
1385 | | |
1386 | | /* Punctuation character */ |
1387 | 501k | p_start = tok->start; |
1388 | 501k | p_end = tok->cur; |
1389 | 501k | return MAKE_TOKEN(_PyToken_OneChar(c)); |
1390 | 501k | } |
1391 | | |
/* Produce the next token while the tokenizer is inside an f-string or
   t-string body (TOK_FSTRING_MODE).  Emits exactly one of:
     - FSTRING_END / TSTRING_END when the closing quote(s) are found,
     - FSTRING_MIDDLE / TSTRING_MIDDLE for a run of literal text,
     - a syntax error token,
   or defers back to tok_get_normal_mode() when an expression part
   (a single '{') begins.  `current_tok` is the active entry of
   tok->tok_mode_stack describing this (possibly nested) string. */
static int
tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
    const char *p_start = NULL;
    const char *p_end = NULL;
    int end_quote_size = 0;      /* consecutive closing-quote chars seen so far */
    int unicode_escape = 0;      /* inside a \N{...} named escape */

    tok->start = tok->cur;
    tok->first_lineno = tok->lineno;
    tok->starting_col_offset = tok->col_offset;

    // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize
    // before it.
    int start_char = tok_nextc(tok);
    if (start_char == '{') {
        int peek1 = tok_nextc(tok);
        tok_backup(tok, peek1);
        tok_backup(tok, start_char);
        // A lone '{' (not the '{{' escape) opens an expression part:
        // switch this mode entry back to regular tokenization.
        if (peek1 != '{') {
            current_tok->curly_bracket_expr_start_depth++;
            if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
                    "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok)));
            }
            TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
            return tok_get_normal_mode(tok, current_tok, token);
        }
    }
    else {
        tok_backup(tok, start_char);
    }

    // Check if we are at the end of the string: consume quote_size (1 or 3)
    // matching quote characters; bail to the middle-scanning loop on mismatch.
    for (int i = 0; i < current_tok->quote_size; i++) {
        int quote = tok_nextc(tok);
        if (quote != current_tok->quote) {
            tok_backup(tok, quote);
            goto f_string_middle;
        }
    }

    // Closing quotes found: release the buffered expression text for this
    // mode entry before popping it off the mode stack.
    if (current_tok->last_expr_buffer != NULL) {
        PyMem_Free(current_tok->last_expr_buffer);
        current_tok->last_expr_buffer = NULL;
        current_tok->last_expr_size = 0;
        current_tok->last_expr_end = -1;
    }

    p_start = tok->start;
    p_end = tok->cur;
    tok->tok_mode_stack_index--;
    return MAKE_TOKEN(FTSTRING_END(current_tok));

f_string_middle:

    // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
    // this.
    tok->multi_line_start = tok->line_start;
    while (end_quote_size != current_tok->quote_size) {
        int c = tok_nextc(tok);
        if (tok->done == E_ERROR || tok->done == E_DECODE) {
            return MAKE_TOKEN(ERRORTOKEN);
        }
        int in_format_spec = (
            current_tok->in_format_spec
            &&
            INSIDE_FSTRING_EXPR(current_tok)
        );

        // End of input, or end of line in a single-quoted string, is an
        // unterminated literal (with one format-spec exception below).
        if (c == EOF || (current_tok->quote_size == 1 && c == '\n')) {
            if (tok->decoding_erred) {
                return MAKE_TOKEN(ERRORTOKEN);
            }

            // If we are in a format spec and we found a newline,
            // it means that the format spec ends here and we should
            // return to the regular mode.
            if (in_format_spec && c == '\n') {
                if (current_tok->quote_size == 1) {
                    return MAKE_TOKEN(
                        _PyTokenizer_syntaxerror(
                            tok,
                            "%c-string: newlines are not allowed in format specifiers for single quoted %c-strings",
                            TOK_GET_STRING_PREFIX(tok), TOK_GET_STRING_PREFIX(tok)
                        )
                    );
                }
                tok_backup(tok, c);
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
                current_tok->in_format_spec = 0;
                p_start = tok->start;
                p_end = tok->cur;
                return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
            }

            assert(tok->multi_line_start != NULL);
            // shift the tok_state's location into
            // the start of string, and report the error
            // from the initial quote character
            tok->cur = (char *)current_tok->start;
            tok->cur++;
            tok->line_start = current_tok->multi_line_start;
            int start = tok->lineno;

            tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
            tok->lineno = the_current_tok->first_line;

            if (current_tok->quote_size == 3) {
                _PyTokenizer_syntaxerror(tok,
                                    "unterminated triple-quoted %c-string literal"
                                    " (detected at line %d)",
                                    TOK_GET_STRING_PREFIX(tok), start);
                if (c != '\n') {
                    tok->done = E_EOFS;
                }
                return MAKE_TOKEN(ERRORTOKEN);
            }
            else {
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
                                    "unterminated %c-string literal (detected at"
                                    " line %d)", TOK_GET_STRING_PREFIX(tok), start));
            }
        }

        // Count a run of quote characters; any other character resets it.
        if (c == current_tok->quote) {
            end_quote_size += 1;
            continue;
        } else {
            end_quote_size = 0;
        }

        if (c == '{') {
            if (!_PyLexer_update_ftstring_expr(tok, c)) {
                return MAKE_TOKEN(ENDMARKER);
            }
            int peek = tok_nextc(tok);
            // '{{' is an escaped brace and stays literal text — except inside
            // a format spec, where a lone '{' always opens a nested expression.
            if (peek != '{' || in_format_spec) {
                tok_backup(tok, peek);
                tok_backup(tok, c);
                current_tok->curly_bracket_expr_start_depth++;
                if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
                        "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok)));
                }
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
                current_tok->in_format_spec = 0;
                p_start = tok->start;
                p_end = tok->cur;
            } else {
                // Escaped '{{': emit the text including only one brace.
                p_start = tok->start;
                p_end = tok->cur - 1;
            }
            return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
        } else if (c == '}') {
            // A '}' closing a \N{...} escape is part of the literal text.
            if (unicode_escape) {
                p_start = tok->start;
                p_end = tok->cur;
                return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
            }
            int peek = tok_nextc(tok);

            // The tokenizer can only be in the format spec if we have already completed the expression
            // scanning (indicated by the end of the expression being set) and we are not at the top level
            // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double
            // brackets, we can bypass it here.
            int cursor = current_tok->curly_bracket_depth;
            if (peek == '}' && !in_format_spec && cursor == 0) {
                // Escaped '}}': emit the text including only one brace.
                p_start = tok->start;
                p_end = tok->cur - 1;
            } else {
                // Lone '}': hand it back to regular mode, which reports
                // "single '}' is not allowed" or matches an open '{'.
                tok_backup(tok, peek);
                tok_backup(tok, c);
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
                current_tok->in_format_spec = 0;
                p_start = tok->start;
                p_end = tok->cur;
            }
            return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
        } else if (c == '\\') {
            int peek = tok_nextc(tok);
            if (peek == '\r') {
                peek = tok_nextc(tok);
            }
            // Special case when the backslash is right before a curly
            // brace. We have to restore and return the control back
            // to the loop for the next iteration.
            if (peek == '{' || peek == '}') {
                if (!current_tok->raw) {
                    if (_PyTokenizer_warn_invalid_escape_sequence(tok, peek)) {
                        return MAKE_TOKEN(ERRORTOKEN);
                    }
                }
                tok_backup(tok, peek);
                continue;
            }

            if (!current_tok->raw) {
                if (peek == 'N') {
                    /* Handle named unicode escapes (\N{BULLET}) */
                    peek = tok_nextc(tok);
                    if (peek == '{') {
                        unicode_escape = 1;
                    } else {
                        tok_backup(tok, peek);
                    }
                }
            } /* else {
                skip the escaped character
            }*/
        }
    }

    // Backup the f-string quotes to emit a final FSTRING_MIDDLE and
    // add the quotes to the FSTRING_END in the next tokenizer iteration.
    for (int i = 0; i < current_tok->quote_size; i++) {
        tok_backup(tok, current_tok->quote);
    }
    p_start = tok->start;
    p_end = tok->cur;
    return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
}
1614 | | |
1615 | | static int |
1616 | | tok_get(struct tok_state *tok, struct token *token) |
1617 | 1.19M | { |
1618 | 1.19M | tokenizer_mode *current_tok = TOK_GET_MODE(tok); |
1619 | 1.19M | if (current_tok->kind == TOK_REGULAR_MODE) { |
1620 | 1.14M | return tok_get_normal_mode(tok, current_tok, token); |
1621 | 1.14M | } else { |
1622 | 51.4k | return tok_get_fstring_mode(tok, current_tok, token); |
1623 | 51.4k | } |
1624 | 1.19M | } |
1625 | | |
1626 | | int |
1627 | | _PyTokenizer_Get(struct tok_state *tok, struct token *token) |
1628 | 1.19M | { |
1629 | 1.19M | int result = tok_get(tok, token); |
1630 | 1.19M | if (tok->decoding_erred) { |
1631 | 0 | result = ERRORTOKEN; |
1632 | 0 | tok->done = E_DECODE; |
1633 | 0 | } |
1634 | 1.19M | return result; |
1635 | 1.19M | } |