/src/cpython/Parser/lexer/lexer.c
Line | Count | Source |
1 | | #include "Python.h" |
2 | | #include "pycore_token.h" |
3 | | #include "pycore_unicodeobject.h" |
4 | | #include "errcode.h" |
5 | | |
6 | | #include "state.h" |
7 | | #include "../tokenizer/helpers.h" |
8 | | |
9 | | /* Alternate tab spacing */ |
10 | 1.62k | #define ALTTABSIZE 1 |
11 | | |
12 | 1.78M | #define is_potential_identifier_start(c) (\ |
13 | 1.78M | (c >= 'a' && c <= 'z')\ |
14 | 1.78M | || (c >= 'A' && c <= 'Z')\ |
15 | 1.78M | || c == '_'\ |
16 | 1.78M | || (c >= 128)) |
17 | | |
18 | 2.40M | #define is_potential_identifier_char(c) (\ |
19 | 2.40M | (c >= 'a' && c <= 'z')\ |
20 | 2.40M | || (c >= 'A' && c <= 'Z')\ |
21 | 2.40M | || (c >= '0' && c <= '9')\ |
22 | 2.40M | || c == '_'\ |
23 | 2.40M | || (c >= 128)) |
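
    These two macros are the ASCII fast path for identifier scanning: any
    byte >= 128 is provisionally accepted here and only later checked
    against the full PEP 3131 rules by verify_identifier(). A minimal
    sketch of the same classification as plain functions (names
    hypothetical, not part of this file):

        static int ident_start(int c) {
            return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
                || c == '_'
                || c >= 128;    /* non-ASCII: verified properly later */
        }
        static int ident_char(int c) {
            return ident_start(c) || (c >= '0' && c <= '9');
        }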
24 | | |
25 | | #ifdef Py_DEBUG |
26 | | static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) { |
27 | | assert(tok->tok_mode_stack_index >= 0); |
28 | | assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL); |
29 | | return &(tok->tok_mode_stack[tok->tok_mode_stack_index]); |
30 | | } |
31 | | static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) { |
32 | | assert(tok->tok_mode_stack_index >= 0); |
33 | | assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL); |
34 | | return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]); |
35 | | } |
36 | | #else |
37 | 1.91M | #define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index])) |
38 | 17.2k | #define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index])) |
39 | | #endif |
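
    Because f-strings and t-strings nest, the lexer keeps a small stack of
    tokenizer modes: TOK_NEXT_MODE pushes a new entry when an f/t-string
    opens, and tok_get_fstring_mode pops it (tok->tok_mode_stack_index--)
    when the closing quote is reached. A conceptual sketch of how the stack
    evolves for a nested input:

        /* source: f"x{ f'{y}' }z"
         *   FSTRING_START (outer)  -> push, depth 1
         *   '{'                    -> switch to TOK_REGULAR_MODE
         *   FSTRING_START (inner)  -> push, depth 2
         *   FSTRING_END   (inner)  -> pop,  depth 1
         *   '}'                    -> back to TOK_FSTRING_MODE
         *   FSTRING_END   (outer)  -> pop,  depth 0 */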
40 | | |
41 | | #define FTSTRING_MIDDLE(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_MIDDLE : FSTRING_MIDDLE) |
42 | | #define FTSTRING_END(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_END : FSTRING_END) |
43 | 35 | #define TOK_GET_STRING_PREFIX(tok) (TOK_GET_MODE(tok)->string_kind == TSTRING ? 't' : 'f') |
44 | 1.79M | #define MAKE_TOKEN(token_type) _PyLexer_token_setup(tok, token, token_type, p_start, p_end) |
45 | 0 | #define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\ |
46 | 0 | _PyLexer_type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) |
47 | | |
48 | | /* Spaces in this constant are treated as "zero or more spaces or tabs" when |
49 | | tokenizing. */ |
50 | | static const char* type_comment_prefix = "# type: "; |
51 | | |
52 | | static inline int |
53 | | contains_null_bytes(const char* str, size_t size) |
54 | 228k | { |
55 | 228k | return memchr(str, 0, size) != NULL; |
56 | 228k | } |
57 | | |
58 | | /* Get next char, updating state; error code goes into tok->done */ |
59 | | static int |
60 | | tok_nextc(struct tok_state *tok) |
61 | 10.9M | { |
62 | 10.9M | int rc; |
63 | 11.1M | for (;;) { |
64 | 11.1M | if (tok->cur != tok->inp) { |
65 | 10.9M | if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) { |
66 | 0 | tok->done = E_COLUMNOVERFLOW; |
67 | 0 | return EOF; |
68 | 0 | } |
69 | 10.9M | tok->col_offset++; |
70 | 10.9M | return Py_CHARMASK(*tok->cur++); /* Fast path */ |
71 | 10.9M | } |
72 | 280k | if (tok->done != E_OK) { |
73 | 34.4k | return EOF; |
74 | 34.4k | } |
75 | 245k | rc = tok->underflow(tok); |
76 | | #if defined(Py_DEBUG) |
77 | | if (tok->debug) { |
78 | | fprintf(stderr, "line[%d] = ", tok->lineno); |
79 | | _PyTokenizer_print_escape(stderr, tok->cur, tok->inp - tok->cur); |
80 | | fprintf(stderr, " tok->done = %d\n", tok->done); |
81 | | } |
82 | | #endif |
83 | 245k | if (!rc) { |
84 | 17.3k | tok->cur = tok->inp; |
85 | 17.3k | return EOF; |
86 | 17.3k | } |
87 | 228k | tok->line_start = tok->cur; |
88 | | |
89 | 228k | if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) { |
90 | 0 | _PyTokenizer_syntaxerror(tok, "source code cannot contain null bytes"); |
91 | 0 | tok->cur = tok->inp; |
92 | 0 | return EOF; |
93 | 0 | } |
94 | 228k | } |
95 | 10.9M | Py_UNREACHABLE(); |
96 | 10.9M | } |
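
    tok_nextc() is a classic buffered reader: a one-compare fast path while
    a byte is available, and a slow path that asks tok->underflow to refill
    the buffer. A self-contained sketch of the same shape (struct and names
    hypothetical, not the real tok_state):

        #include <stdio.h>

        struct reader {
            const char *cur, *inp;             /* next byte, end of data */
            int (*refill)(struct reader *);    /* returns 0 on EOF */
        };

        static int next_char(struct reader *r) {
            for (;;) {
                if (r->cur != r->inp) {        /* fast path */
                    return (unsigned char)*r->cur++;
                }
                if (!r->refill(r)) {           /* slow path: refill buffer */
                    return EOF;
                }
            }
        }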
97 | | |
98 | | /* Back-up one character */ |
99 | | static void |
100 | | tok_backup(struct tok_state *tok, int c) |
101 | 3.76M | { |
102 | 3.76M | if (c != EOF) { |
103 | 3.73M | if (--tok->cur < tok->buf) { |
104 | 0 | Py_FatalError("tokenizer beginning of buffer"); |
105 | 0 | } |
106 | 3.73M | if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) { |
107 | 0 | Py_FatalError("tok_backup: wrong character"); |
108 | 0 | } |
109 | 3.73M | tok->col_offset--; |
110 | 3.73M | } |
111 | 3.76M | } |
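
    tok_backup() exists so callers can read one character ahead and push it
    back if it belongs to the next token; much of the lexer below is built
    on this nextc/backup pair. A one-character peek, sketched with the real
    functions (helper name hypothetical):

        static int peek_char(struct tok_state *tok) {
            int c = tok_nextc(tok);
            tok_backup(tok, c);    /* tok_backup() ignores EOF, so safe */
            return c;
        }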
112 | | |
113 | | static int |
114 | 22.9k | set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { |
115 | 22.9k | assert(token != NULL); |
116 | 22.9k | assert(c == '}' || c == ':' || c == '!'); |
117 | 22.9k | tokenizer_mode *tok_mode = TOK_GET_MODE(tok); |
118 | | |
119 | 22.9k | if (!(tok_mode->in_debug || tok_mode->string_kind == TSTRING) || token->metadata) { |
120 | 13.1k | return 0; |
121 | 13.1k | } |
122 | 9.83k | PyObject *res = NULL; |
123 | | |
124 | | // Look for a # character outside of string literals |
125 | 9.83k | int hash_detected = 0; |
126 | 9.83k | int in_string = 0; |
127 | 9.83k | char quote_char = 0; |
128 | | |
129 | 1.91M | for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) { |
130 | 1.90M | char ch = tok_mode->last_expr_buffer[i]; |
131 | | |
132 | | // Skip escaped characters |
133 | 1.90M | if (ch == '\\') { |
134 | 38.3k | i++; |
135 | 38.3k | continue; |
136 | 38.3k | } |
137 | | |
138 | | // Handle quotes |
139 | 1.86M | if (ch == '"' || ch == '\'') { |
140 | | // The following if/else block works because there is an odd number
141 | | // of quotes in STRING tokens and the lexer only ever reaches this |
142 | | // function with valid STRING tokens. |
143 | | // For example: """hello""" |
144 | | // First quote: in_string = 1 |
145 | | // Second quote: in_string = 0 |
146 | | // Third quote: in_string = 1 |
147 | 219k | if (!in_string) { |
148 | 76.7k | in_string = 1; |
149 | 76.7k | quote_char = ch; |
150 | 76.7k | } |
151 | 142k | else if (ch == quote_char) { |
152 | 75.5k | in_string = 0; |
153 | 75.5k | } |
154 | 219k | continue; |
155 | 219k | } |
156 | | |
157 | | // Check for # outside strings |
158 | 1.64M | if (ch == '#' && !in_string) { |
159 | 841 | hash_detected = 1; |
160 | 841 | break; |
161 | 841 | } |
162 | 1.64M | } |
163 | | // If we found a # character in the expression, we need to handle comments |
164 | 9.83k | if (hash_detected) { |
165 | | // Allocate buffer for processed result |
166 | 841 | char *result = (char *)PyMem_Malloc((tok_mode->last_expr_size - tok_mode->last_expr_end + 1) * sizeof(char)); |
167 | 841 | if (!result) { |
168 | 0 | return -1; |
169 | 0 | } |
170 | | |
171 | 841 | Py_ssize_t i = 0; // Input position |
172 | 841 | Py_ssize_t j = 0; // Output position |
173 | 841 | in_string = 0; // Whether we're in a string |
174 | 841 | quote_char = 0; // Current string quote char |
175 | | |
176 | | // Process each character |
177 | 61.0k | while (i < tok_mode->last_expr_size - tok_mode->last_expr_end) { |
178 | 60.1k | char ch = tok_mode->last_expr_buffer[i]; |
179 | | |
180 | | // Handle string quotes |
181 | 60.1k | if (ch == '"' || ch == '\'') { |
182 | | // See comment above to understand this part |
183 | 8.79k | if (!in_string) { |
184 | 3.48k | in_string = 1; |
185 | 3.48k | quote_char = ch; |
186 | 5.31k | } else if (ch == quote_char) { |
187 | 3.46k | in_string = 0; |
188 | 3.46k | } |
189 | 8.79k | result[j++] = ch; |
190 | 8.79k | } |
191 | | // Skip comments |
192 | 51.3k | else if (ch == '#' && !in_string) { |
193 | 55.6k | while (i < tok_mode->last_expr_size - tok_mode->last_expr_end && |
194 | 55.6k | tok_mode->last_expr_buffer[i] != '\n') { |
195 | 54.5k | i++; |
196 | 54.5k | } |
197 | 1.03k | if (i < tok_mode->last_expr_size - tok_mode->last_expr_end) { |
198 | 287 | result[j++] = '\n'; |
199 | 287 | } |
200 | 1.03k | } |
201 | | // Copy other chars |
202 | 50.3k | else { |
203 | 50.3k | result[j++] = ch; |
204 | 50.3k | } |
205 | 60.1k | i++; |
206 | 60.1k | } |
207 | | |
208 | 841 | result[j] = '\0'; // Null-terminate the result string |
209 | 841 | res = PyUnicode_DecodeUTF8(result, j, NULL); |
210 | 841 | PyMem_Free(result); |
211 | 8.99k | } else { |
212 | 8.99k | res = PyUnicode_DecodeUTF8( |
213 | 8.99k | tok_mode->last_expr_buffer, |
214 | 8.99k | tok_mode->last_expr_size - tok_mode->last_expr_end, |
215 | 8.99k | NULL |
216 | 8.99k | ); |
217 | 8.99k | } |
218 | | |
219 | 9.83k | if (!res) { |
220 | 10 | return -1; |
221 | 10 | } |
222 | 9.82k | token->metadata = res; |
223 | 9.82k | return 0; |
224 | 9.83k | } |
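
    The in_string toggle above works because a valid STRING token contains
    an odd number of its quote character, so flipping state on an opening
    quote and back on a matching one tracks string boundaries without real
    parsing. The first scan, extracted as a standalone sketch (hypothetical
    helper, same logic):

        #include <stddef.h>

        /* Return 1 if buf[0..n) contains a '#' outside string literals. */
        static int has_unquoted_hash(const char *buf, size_t n) {
            int in_string = 0;
            char quote = 0;
            for (size_t i = 0; i < n; i++) {
                char ch = buf[i];
                if (ch == '\\') { i++; continue; }  /* skip escaped char */
                if (ch == '"' || ch == '\'') {
                    if (!in_string) { in_string = 1; quote = ch; }
                    else if (ch == quote) { in_string = 0; }
                }
                else if (ch == '#' && !in_string) {
                    return 1;
                }
            }
            return 0;
        }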
225 | | |
226 | | int |
227 | | _PyLexer_update_ftstring_expr(struct tok_state *tok, char cur) |
228 | 64.7k | { |
229 | 64.7k | assert(tok->cur != NULL); |
230 | | |
231 | 64.7k | Py_ssize_t size = strlen(tok->cur); |
232 | 64.7k | tokenizer_mode *tok_mode = TOK_GET_MODE(tok); |
233 | | |
234 | 64.7k | switch (cur) { |
235 | 0 | case 0: |
236 | 0 | if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) { |
237 | 0 | return 1; |
238 | 0 | } |
239 | 0 | char *new_buffer = PyMem_Realloc( |
240 | 0 | tok_mode->last_expr_buffer, |
241 | 0 | tok_mode->last_expr_size + size |
242 | 0 | ); |
243 | 0 | if (new_buffer == NULL) { |
244 | 0 | PyMem_Free(tok_mode->last_expr_buffer); |
245 | 0 | goto error; |
246 | 0 | } |
247 | 0 | tok_mode->last_expr_buffer = new_buffer; |
248 | 0 | strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size); |
249 | 0 | tok_mode->last_expr_size += size; |
250 | 0 | break; |
251 | 41.7k | case '{': |
252 | 41.7k | if (tok_mode->last_expr_buffer != NULL) { |
253 | 29.7k | PyMem_Free(tok_mode->last_expr_buffer); |
254 | 29.7k | } |
255 | 41.7k | tok_mode->last_expr_buffer = PyMem_Malloc(size); |
256 | 41.7k | if (tok_mode->last_expr_buffer == NULL) { |
257 | 0 | goto error; |
258 | 0 | } |
259 | 41.7k | tok_mode->last_expr_size = size; |
260 | 41.7k | tok_mode->last_expr_end = -1; |
261 | 41.7k | strncpy(tok_mode->last_expr_buffer, tok->cur, size); |
262 | 41.7k | break; |
263 | 18.4k | case '}': |
264 | 20.1k | case '!': |
265 | 20.1k | tok_mode->last_expr_end = strlen(tok->start); |
266 | 20.1k | break; |
267 | 2.86k | case ':': |
268 | 2.86k | if (tok_mode->last_expr_end == -1) { |
269 | 2.60k | tok_mode->last_expr_end = strlen(tok->start); |
270 | 2.60k | } |
271 | 2.86k | break; |
272 | 0 | default: |
273 | 0 | Py_UNREACHABLE(); |
274 | 64.7k | } |
275 | 64.7k | return 1; |
276 | 0 | error: |
277 | 0 | tok->done = E_NOMEM; |
278 | 0 | return 0; |
279 | 64.7k | } |
280 | | |
281 | | static int |
282 | | lookahead(struct tok_state *tok, const char *test) |
283 | 7.64k | { |
284 | 7.64k | const char *s = test; |
285 | 7.64k | int res = 0; |
286 | 20.2k | while (1) { |
287 | 20.2k | int c = tok_nextc(tok); |
288 | 20.2k | if (*s == 0) { |
289 | 7.55k | res = !is_potential_identifier_char(c); |
290 | 7.55k | } |
291 | 12.7k | else if (c == *s) { |
292 | 12.6k | s++; |
293 | 12.6k | continue; |
294 | 12.6k | } |
295 | | |
296 | 7.64k | tok_backup(tok, c); |
297 | 20.2k | while (s != test) { |
298 | 12.6k | tok_backup(tok, *--s); |
299 | 12.6k | } |
300 | 7.64k | return res; |
301 | 20.2k | } |
302 | 7.64k | } |
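
    lookahead() tentatively consumes the tail of a keyword and then rewinds
    every character with tok_backup(), so the input stream is unchanged
    whether or not it matched; it returns true only when the full tail
    matched and the next character cannot extend an identifier. Typical use
    (sketch; verify_end_of_number() below does exactly this):

        int c = tok_nextc(tok);
        if (c == 'a' && lookahead(tok, "nd")) {
            /* the literal is immediately followed by the keyword "and" */
        }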
303 | | |
304 | | static int |
305 | 104k | verify_end_of_number(struct tok_state *tok, int c, const char *kind) { |
306 | 104k | if (tok->tok_extra_tokens) { |
307 | | // When we are parsing extra tokens, we don't want to emit warnings |
308 | | // about invalid literals, because we want to be a bit more liberal. |
309 | 0 | return 1; |
310 | 0 | } |
311 | | /* Emit a deprecation warning only if the numeric literal is immediately
312 | | * followed by one of the keywords that can occur after a numeric literal
313 | | * in valid code: "and", "else", "for", "if", "in", "is" and "or".
314 | | * This allows us to gradually deprecate existing valid code without
315 | | * emitting a warning before the error in most cases of an invalid
316 | | * numeric literal (which would be confusing and break existing tests).
317 | | * Raise a syntax error with a slightly better message than plain
318 | | * "invalid syntax" if the numeric literal is immediately followed by
319 | | * another keyword or identifier.
320 | | */
321 | 104k | int r = 0; |
322 | 104k | if (c == 'a') { |
323 | 751 | r = lookahead(tok, "nd"); |
324 | 751 | } |
325 | 103k | else if (c == 'e') { |
326 | 426 | r = lookahead(tok, "lse"); |
327 | 426 | } |
328 | 103k | else if (c == 'f') { |
329 | 3.31k | r = lookahead(tok, "or"); |
330 | 3.31k | } |
331 | 100k | else if (c == 'i') { |
332 | 2.34k | int c2 = tok_nextc(tok); |
333 | 2.34k | if (c2 == 'f' || c2 == 'n' || c2 == 's') { |
334 | 2.33k | r = 1; |
335 | 2.33k | } |
336 | 2.34k | tok_backup(tok, c2); |
337 | 2.34k | } |
338 | 97.8k | else if (c == 'o') { |
339 | 2.86k | r = lookahead(tok, "r"); |
340 | 2.86k | } |
341 | 94.9k | else if (c == 'n') { |
342 | 287 | r = lookahead(tok, "ot"); |
343 | 287 | } |
344 | 104k | if (r) { |
345 | 9.87k | tok_backup(tok, c); |
346 | 9.87k | if (_PyTokenizer_parser_warn(tok, PyExc_SyntaxWarning, |
347 | 9.87k | "invalid %s literal", kind)) |
348 | 0 | { |
349 | 0 | return 0; |
350 | 0 | } |
351 | 9.87k | tok_nextc(tok); |
352 | 9.87k | } |
353 | 94.7k | else /* In future releases, only error will remain. */ |
354 | 94.7k | if (c < 128 && is_potential_identifier_char(c)) { |
355 | 201 | tok_backup(tok, c); |
356 | 201 | _PyTokenizer_syntaxerror(tok, "invalid %s literal", kind); |
357 | 201 | return 0; |
358 | 201 | } |
359 | 104k | return 1; |
360 | 104k | } |
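
    Concretely (inputs shown as Python source text; behavior per the code
    above):

        /* "1if x else y"  -> SyntaxWarning "invalid decimal literal",
         *                    then tokenizes as before
         * "0x1for i in s" -> SyntaxWarning "invalid hexadecimal literal"
         * "1abc"          -> SyntaxError "invalid decimal literal" */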
361 | | |
362 | | /* Verify that the identifier follows PEP 3131. */ |
363 | | static int |
364 | | verify_identifier(struct tok_state *tok) |
365 | 14.1k | { |
366 | 14.1k | if (tok->tok_extra_tokens) { |
367 | 0 | return 1; |
368 | 0 | } |
369 | 14.1k | PyObject *s; |
370 | 14.1k | if (tok->decoding_erred) |
371 | 0 | return 0; |
372 | 14.1k | s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); |
373 | 14.1k | if (s == NULL) { |
374 | 1.07k | if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { |
375 | 1.07k | tok->done = E_DECODE; |
376 | 1.07k | } |
377 | 0 | else { |
378 | 0 | tok->done = E_ERROR; |
379 | 0 | } |
380 | 1.07k | return 0; |
381 | 1.07k | } |
382 | 13.0k | Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); |
383 | 13.0k | assert(invalid >= 0); |
384 | 13.0k | assert(PyUnicode_GET_LENGTH(s) > 0); |
385 | 13.0k | if (invalid < PyUnicode_GET_LENGTH(s)) { |
386 | 707 | Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); |
387 | 707 | if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { |
388 | | /* Determine the offset in UTF-8 encoded input */ |
389 | 479 | Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); |
390 | 479 | if (s != NULL) { |
391 | 479 | Py_SETREF(s, PyUnicode_AsUTF8String(s)); |
392 | 479 | } |
393 | 479 | if (s == NULL) { |
394 | 0 | tok->done = E_ERROR; |
395 | 0 | return 0; |
396 | 0 | } |
397 | 479 | tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); |
398 | 479 | } |
399 | 707 | Py_DECREF(s); |
400 | 707 | if (Py_UNICODE_ISPRINTABLE(ch)) { |
401 | 394 | _PyTokenizer_syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch); |
402 | 394 | } |
403 | 313 | else { |
404 | 313 | _PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", ch); |
405 | 313 | } |
406 | 707 | return 0; |
407 | 707 | } |
408 | 12.3k | Py_DECREF(s); |
409 | 12.3k | return 1; |
410 | 13.0k | } |
411 | | |
412 | | static int |
413 | | tok_decimal_tail(struct tok_state *tok) |
414 | 83.8k | { |
415 | 83.8k | int c; |
416 | | |
417 | 84.4k | while (1) { |
418 | 234k | do { |
419 | 234k | c = tok_nextc(tok); |
420 | 234k | } while (Py_ISDIGIT(c)); |
421 | 84.4k | if (c != '_') { |
422 | 83.8k | break; |
423 | 83.8k | } |
424 | 531 | c = tok_nextc(tok); |
425 | 531 | if (!Py_ISDIGIT(c)) { |
426 | 17 | tok_backup(tok, c); |
427 | 17 | _PyTokenizer_syntaxerror(tok, "invalid decimal literal"); |
428 | 17 | return 0; |
429 | 17 | } |
430 | 531 | } |
431 | 83.8k | return c; |
432 | 83.8k | } |
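
    tok_decimal_tail() accepts runs of digits joined by single underscores,
    which is why 1_000 scans while 1__000 and a trailing 1_ are rejected as
    "invalid decimal literal". The same scan over a plain string, as a
    sketch (hypothetical helper):

        #include <ctype.h>
        #include <stddef.h>

        /* Returns 1 and sets *len on success; 0 on a misplaced '_'. */
        static int scan_decimal_tail(const char *s, size_t *len) {
            size_t i = 0;
            for (;;) {
                while (isdigit((unsigned char)s[i])) i++;
                if (s[i] != '_') break;
                if (!isdigit((unsigned char)s[i + 1])) {
                    return 0;               /* "1_" or "1__000" */
                }
                i++;                        /* skip the underscore */
            }
            *len = i;
            return 1;
        }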
433 | | |
434 | | static inline int |
435 | 1.09k | tok_continuation_line(struct tok_state *tok) { |
436 | 1.09k | int c = tok_nextc(tok); |
437 | 1.09k | if (c == '\r') { |
438 | 72 | c = tok_nextc(tok); |
439 | 72 | } |
440 | 1.09k | if (c != '\n') { |
441 | 63 | tok->done = E_LINECONT; |
442 | 63 | return -1; |
443 | 63 | } |
444 | 1.03k | c = tok_nextc(tok); |
445 | 1.03k | if (c == EOF) { |
446 | 49 | tok->done = E_EOF; |
447 | 49 | tok->cur = tok->inp; |
448 | 49 | return -1; |
449 | 982 | } else { |
450 | 982 | tok_backup(tok, c); |
451 | 982 | } |
452 | 982 | return c; |
453 | 1.03k | } |
454 | | |
455 | | static int |
456 | | maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok, |
457 | | int saw_b, int saw_r, int saw_u, |
458 | 22.7k | int saw_f, int saw_t) { |
459 | | // Supported: rb, rf, rt (in any order) |
460 | | // Unsupported: ub, ur, uf, ut, bf, bt, ft (in any order) |
461 | | |
462 | 22.7k | #define RETURN_SYNTAX_ERROR(PREFIX1, PREFIX2) \ |
463 | 22.7k | do { \ |
464 | 7 | (void)_PyTokenizer_syntaxerror_known_range( \ |
465 | 7 | tok, (int)(tok->start + 1 - tok->line_start), \ |
466 | 7 | (int)(tok->cur - tok->line_start), \ |
467 | 7 | "'" PREFIX1 "' and '" PREFIX2 "' prefixes are incompatible"); \ |
468 | 7 | return -1; \ |
469 | 7 | } while (0) |
470 | | |
471 | 22.7k | if (saw_u && saw_b) { |
472 | 1 | RETURN_SYNTAX_ERROR("u", "b"); |
473 | 1 | } |
474 | 22.7k | if (saw_u && saw_r) { |
475 | 1 | RETURN_SYNTAX_ERROR("u", "r"); |
476 | 1 | } |
477 | 22.7k | if (saw_u && saw_f) { |
478 | 1 | RETURN_SYNTAX_ERROR("u", "f"); |
479 | 1 | } |
480 | 22.7k | if (saw_u && saw_t) { |
481 | 1 | RETURN_SYNTAX_ERROR("u", "t"); |
482 | 1 | } |
483 | | |
484 | 22.7k | if (saw_b && saw_f) { |
485 | 1 | RETURN_SYNTAX_ERROR("b", "f"); |
486 | 1 | } |
487 | 22.7k | if (saw_b && saw_t) { |
488 | 1 | RETURN_SYNTAX_ERROR("b", "t"); |
489 | 1 | } |
490 | | |
491 | 22.7k | if (saw_f && saw_t) { |
492 | 1 | RETURN_SYNTAX_ERROR("f", "t"); |
493 | 1 | } |
494 | | |
495 | 22.7k | #undef RETURN_SYNTAX_ERROR |
496 | | |
497 | 22.7k | return 0; |
498 | 22.7k | } |
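
    The net rule: 'r' combines with any one of b/f/t, 'u' must stand alone,
    and b/f/t are mutually exclusive. Illustrative (any letter case, any
    order):

        /* accepted: rb"" br"" rf"" fr"" rt"" tr""
         * rejected: ub"" ur"" uf"" ut"" bf"" bt"" ft"" */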
499 | | |
500 | | static int |
501 | | tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) |
502 | 1.75M | { |
503 | 1.75M | int c; |
504 | 1.75M | int blankline, nonascii; |
505 | | |
506 | 1.75M | const char *p_start = NULL; |
507 | 1.75M | const char *p_end = NULL; |
508 | 1.84M | nextline: |
509 | 1.84M | tok->start = NULL; |
510 | 1.84M | tok->starting_col_offset = -1; |
511 | 1.84M | blankline = 0; |
512 | | |
513 | | |
514 | | /* Get indentation level */ |
515 | 1.84M | if (tok->atbol) { |
516 | 228k | int col = 0; |
517 | 228k | int altcol = 0; |
518 | 228k | tok->atbol = 0; |
519 | 228k | int cont_line_col = 0; |
520 | 915k | for (;;) { |
521 | 915k | c = tok_nextc(tok); |
522 | 915k | if (c == ' ') { |
523 | 683k | col++, altcol++; |
524 | 683k | } |
525 | 231k | else if (c == '\t') { |
526 | 813 | col = (col / tok->tabsize + 1) * tok->tabsize; |
527 | 813 | altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE; |
528 | 813 | } |
529 | 230k | else if (c == '\014') {/* Control-L (formfeed) */ |
530 | 1.95k | col = altcol = 0; /* For Emacs users */ |
531 | 1.95k | } |
532 | 228k | else if (c == '\\') { |
533 | | // Indentation cannot be split over multiple physical lines |
534 | | // using backslashes. This means that if we found a backslash |
535 | | // preceded by whitespace, **the first one we find** determines |
536 | | // the level of indentation of whatever comes next. |
537 | 659 | cont_line_col = cont_line_col ? cont_line_col : col; |
538 | 659 | if ((c = tok_continuation_line(tok)) == -1) { |
539 | 42 | return MAKE_TOKEN(ERRORTOKEN); |
540 | 42 | } |
541 | 659 | } |
542 | 228k | else { |
543 | 228k | break; |
544 | 228k | } |
545 | 915k | } |
546 | 228k | tok_backup(tok, c); |
547 | 228k | if (c == '#' || c == '\n' || c == '\r') { |
548 | | /* Lines with only whitespace and/or comments |
549 | | shouldn't affect the indentation and are |
550 | | not passed to the parser as NEWLINE tokens, |
551 | | except *totally* empty lines in interactive |
552 | | mode, which signal the end of a command group. */ |
553 | 45.0k | if (col == 0 && c == '\n' && tok->prompt != NULL) { |
554 | 0 | blankline = 0; /* Let it through */ |
555 | 0 | } |
556 | 45.0k | else if (tok->prompt != NULL && tok->lineno == 1) { |
557 | | /* In interactive mode, if the first line contains |
558 | | only spaces and/or a comment, let it through. */ |
559 | 0 | blankline = 0; |
560 | 0 | col = altcol = 0; |
561 | 0 | } |
562 | 45.0k | else { |
563 | 45.0k | blankline = 1; /* Ignore completely */ |
564 | 45.0k | } |
565 | | /* We can't jump back right here since we still |
566 | | may need to skip to the end of a comment */ |
567 | 45.0k | } |
568 | 228k | if (!blankline && tok->level == 0) { |
569 | 142k | col = cont_line_col ? cont_line_col : col; |
570 | 142k | altcol = cont_line_col ? cont_line_col : altcol; |
571 | 142k | if (col == tok->indstack[tok->indent]) { |
572 | | /* No change */ |
573 | 103k | if (altcol != tok->altindstack[tok->indent]) { |
574 | 1 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
575 | 1 | } |
576 | 103k | } |
577 | 38.4k | else if (col > tok->indstack[tok->indent]) { |
578 | | /* Indent -- always one */ |
579 | 21.5k | if (tok->indent+1 >= MAXINDENT) { |
580 | 0 | tok->done = E_TOODEEP; |
581 | 0 | tok->cur = tok->inp; |
582 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
583 | 0 | } |
584 | 21.5k | if (altcol <= tok->altindstack[tok->indent]) { |
585 | 3 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
586 | 3 | } |
587 | 21.5k | tok->pendin++; |
588 | 21.5k | tok->indstack[++tok->indent] = col; |
589 | 21.5k | tok->altindstack[tok->indent] = altcol; |
590 | 21.5k | } |
591 | 16.9k | else /* col < tok->indstack[tok->indent] */ { |
592 | | /* Dedent -- any number, must be consistent */ |
593 | 37.6k | while (tok->indent > 0 && |
594 | 37.6k | col < tok->indstack[tok->indent]) { |
595 | 20.6k | tok->pendin--; |
596 | 20.6k | tok->indent--; |
597 | 20.6k | } |
598 | 16.9k | if (col != tok->indstack[tok->indent]) { |
599 | 9 | tok->done = E_DEDENT; |
600 | 9 | tok->cur = tok->inp; |
601 | 9 | return MAKE_TOKEN(ERRORTOKEN); |
602 | 9 | } |
603 | 16.9k | if (altcol != tok->altindstack[tok->indent]) { |
604 | 1 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
605 | 1 | } |
606 | 16.9k | } |
607 | 142k | } |
608 | 228k | } |
609 | | |
610 | 1.84M | tok->start = tok->cur; |
611 | 1.84M | tok->starting_col_offset = tok->col_offset; |
612 | | |
613 | | /* Return pending indents/dedents */ |
614 | 1.84M | if (tok->pendin != 0) { |
615 | 42.1k | if (tok->pendin < 0) { |
616 | 20.6k | if (tok->tok_extra_tokens) { |
617 | 0 | p_start = tok->cur; |
618 | 0 | p_end = tok->cur; |
619 | 0 | } |
620 | 20.6k | tok->pendin++; |
621 | 20.6k | return MAKE_TOKEN(DEDENT); |
622 | 20.6k | } |
623 | 21.5k | else { |
624 | 21.5k | if (tok->tok_extra_tokens) { |
625 | 0 | p_start = tok->buf; |
626 | 0 | p_end = tok->cur; |
627 | 0 | } |
628 | 21.5k | tok->pendin--; |
629 | 21.5k | return MAKE_TOKEN(INDENT); |
630 | 21.5k | } |
631 | 42.1k | } |
632 | | |
633 | | /* Peek ahead at the next character */ |
634 | 1.80M | c = tok_nextc(tok); |
635 | 1.80M | tok_backup(tok, c); |
636 | | |
637 | 1.80M | again: |
638 | 1.80M | tok->start = NULL; |
639 | | /* Skip spaces */ |
640 | 2.14M | do { |
641 | 2.14M | c = tok_nextc(tok); |
642 | 2.14M | } while (c == ' ' || c == '\t' || c == '\014'); |
643 | | |
644 | | /* Set start of current token */ |
645 | 1.80M | tok->start = tok->cur == NULL ? NULL : tok->cur - 1; |
646 | 1.80M | tok->starting_col_offset = tok->col_offset - 1; |
647 | | |
648 | | /* Skip comment, unless it's a type comment */ |
649 | 1.80M | if (c == '#') { |
650 | | |
651 | 42.4k | const char* p = NULL; |
652 | 42.4k | const char *prefix, *type_start; |
653 | 42.4k | int current_starting_col_offset; |
654 | | |
655 | 1.31M | while (c != EOF && c != '\n' && c != '\r') { |
656 | 1.27M | c = tok_nextc(tok); |
657 | 1.27M | } |
658 | | |
659 | 42.4k | if (tok->tok_extra_tokens) { |
660 | 0 | p = tok->start; |
661 | 0 | } |
662 | | |
663 | 42.4k | if (tok->type_comments) { |
664 | 0 | p = tok->start; |
665 | 0 | current_starting_col_offset = tok->starting_col_offset; |
666 | 0 | prefix = type_comment_prefix; |
667 | 0 | while (*prefix && p < tok->cur) { |
668 | 0 | if (*prefix == ' ') { |
669 | 0 | while (*p == ' ' || *p == '\t') { |
670 | 0 | p++; |
671 | 0 | current_starting_col_offset++; |
672 | 0 | } |
673 | 0 | } else if (*prefix == *p) { |
674 | 0 | p++; |
675 | 0 | current_starting_col_offset++; |
676 | 0 | } else { |
677 | 0 | break; |
678 | 0 | } |
679 | | |
680 | 0 | prefix++; |
681 | 0 | } |
682 | | |
683 | | /* This is a type comment if we matched all of type_comment_prefix. */ |
684 | 0 | if (!*prefix) { |
685 | 0 | int is_type_ignore = 1; |
686 | | // +6 in order to skip the word 'ignore' |
687 | 0 | const char *ignore_end = p + 6; |
688 | 0 | const int ignore_end_col_offset = current_starting_col_offset + 6; |
689 | 0 | tok_backup(tok, c); /* don't eat the newline or EOF */ |
690 | |
691 | 0 | type_start = p; |
692 | | |
693 | | /* A TYPE_IGNORE is "type: ignore" followed by the end of the token |
694 | | * or anything ASCII and non-alphanumeric. */ |
695 | 0 | is_type_ignore = ( |
696 | 0 | tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 |
697 | 0 | && !(tok->cur > ignore_end |
698 | 0 | && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); |
699 | |
700 | 0 | if (is_type_ignore) { |
701 | 0 | p_start = ignore_end; |
702 | 0 | p_end = tok->cur; |
703 | | |
704 | | /* If this type ignore is the only thing on the line, consume the newline also. */ |
705 | 0 | if (blankline) { |
706 | 0 | tok_nextc(tok); |
707 | 0 | tok->atbol = 1; |
708 | 0 | } |
709 | 0 | return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); |
710 | 0 | } else { |
711 | 0 | p_start = type_start; |
712 | 0 | p_end = tok->cur; |
713 | 0 | return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset); |
714 | 0 | } |
715 | 0 | } |
716 | 0 | } |
717 | 42.4k | if (tok->tok_extra_tokens) { |
718 | 0 | tok_backup(tok, c); /* don't eat the newline or EOF */ |
719 | 0 | p_start = p; |
720 | 0 | p_end = tok->cur; |
721 | 0 | tok->comment_newline = blankline; |
722 | 0 | return MAKE_TOKEN(COMMENT); |
723 | 0 | } |
724 | 42.4k | } |
725 | | |
726 | 1.80M | if (tok->done == E_INTERACT_STOP) { |
727 | 0 | return MAKE_TOKEN(ENDMARKER); |
728 | 0 | } |
729 | | |
730 | | /* Check for EOF and errors now */ |
731 | 1.80M | if (c == EOF) { |
732 | 17.2k | if (tok->level) { |
733 | 4.10k | return MAKE_TOKEN(ERRORTOKEN); |
734 | 4.10k | } |
735 | 13.1k | return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN); |
736 | 17.2k | } |
737 | | |
738 | | /* Identifier (most frequent token!) */ |
739 | 1.78M | nonascii = 0; |
740 | 1.78M | if (is_potential_identifier_start(c)) { |
741 | | /* Process the various legal combinations of b"", r"", u"", f"", and t"". */
742 | 528k | int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0; |
743 | 647k | while (1) { |
744 | 647k | if (!saw_b && (c == 'b' || c == 'B')) { |
745 | 21.0k | saw_b = 1; |
746 | 21.0k | } |
747 | | /* Since u"" is only a backwards-compatibility literal, we don't
748 | | want to support it in arbitrary order like byte literals. */
749 | 626k | else if (!saw_u && (c == 'u'|| c == 'U')) { |
750 | 6.88k | saw_u = 1; |
751 | 6.88k | } |
752 | | /* ur"" and ru"" are not supported */ |
753 | 619k | else if (!saw_r && (c == 'r' || c == 'R')) { |
754 | 37.9k | saw_r = 1; |
755 | 37.9k | } |
756 | 581k | else if (!saw_f && (c == 'f' || c == 'F')) { |
757 | 44.4k | saw_f = 1; |
758 | 44.4k | } |
759 | 537k | else if (!saw_t && (c == 't' || c == 'T')) { |
760 | 31.1k | saw_t = 1; |
761 | 31.1k | } |
762 | 505k | else { |
763 | 505k | break; |
764 | 505k | } |
765 | 141k | c = tok_nextc(tok); |
766 | 141k | if (c == '"' || c == '\'') { |
767 | | // Raise error on incompatible string prefixes: |
768 | 22.7k | int status = maybe_raise_syntax_error_for_string_prefixes( |
769 | 22.7k | tok, saw_b, saw_r, saw_u, saw_f, saw_t); |
770 | 22.7k | if (status < 0) { |
771 | 7 | return MAKE_TOKEN(ERRORTOKEN); |
772 | 7 | } |
773 | | |
774 | | // Handle valid f or t string creation: |
775 | 22.7k | if (saw_f || saw_t) { |
776 | 17.2k | goto f_string_quote; |
777 | 17.2k | } |
778 | 5.46k | goto letter_quote; |
779 | 22.7k | } |
780 | 141k | } |
781 | 2.30M | while (is_potential_identifier_char(c)) { |
782 | 1.79M | if (c >= 128) { |
783 | 144k | nonascii = 1; |
784 | 144k | } |
785 | 1.79M | c = tok_nextc(tok); |
786 | 1.79M | } |
787 | 505k | tok_backup(tok, c); |
788 | 505k | if (nonascii && !verify_identifier(tok)) { |
789 | 1.78k | return MAKE_TOKEN(ERRORTOKEN); |
790 | 1.78k | } |
791 | | |
792 | 504k | p_start = tok->start; |
793 | 504k | p_end = tok->cur; |
794 | | |
795 | 504k | return MAKE_TOKEN(NAME); |
796 | 505k | } |
797 | | |
798 | 1.25M | if (c == '\r') { |
799 | 442 | c = tok_nextc(tok); |
800 | 442 | } |
801 | | |
802 | | /* Newline */ |
803 | 1.25M | if (c == '\n') { |
804 | 206k | tok->atbol = 1; |
805 | 206k | if (blankline || tok->level > 0) { |
806 | 85.9k | if (tok->tok_extra_tokens) { |
807 | 0 | if (tok->comment_newline) { |
808 | 0 | tok->comment_newline = 0; |
809 | 0 | } |
810 | 0 | p_start = tok->start; |
811 | 0 | p_end = tok->cur; |
812 | 0 | return MAKE_TOKEN(NL); |
813 | 0 | } |
814 | 85.9k | goto nextline; |
815 | 85.9k | } |
816 | 120k | if (tok->comment_newline && tok->tok_extra_tokens) { |
817 | 0 | tok->comment_newline = 0; |
818 | 0 | p_start = tok->start; |
819 | 0 | p_end = tok->cur; |
820 | 0 | return MAKE_TOKEN(NL); |
821 | 0 | } |
822 | 120k | p_start = tok->start; |
823 | 120k | p_end = tok->cur - 1; /* Leave '\n' out of the string */ |
824 | 120k | tok->cont_line = 0; |
825 | 120k | return MAKE_TOKEN(NEWLINE); |
826 | 120k | } |
827 | | |
828 | | /* Period or number starting with period? */ |
829 | 1.04M | if (c == '.') { |
830 | 34.4k | c = tok_nextc(tok); |
831 | 34.4k | if (Py_ISDIGIT(c)) { |
832 | 2.87k | goto fraction; |
833 | 31.5k | } else if (c == '.') { |
834 | 3.28k | c = tok_nextc(tok); |
835 | 3.28k | if (c == '.') { |
836 | 2.50k | p_start = tok->start; |
837 | 2.50k | p_end = tok->cur; |
838 | 2.50k | return MAKE_TOKEN(ELLIPSIS); |
839 | 2.50k | } |
840 | 779 | else { |
841 | 779 | tok_backup(tok, c); |
842 | 779 | } |
843 | 779 | tok_backup(tok, '.'); |
844 | 779 | } |
845 | 28.3k | else { |
846 | 28.3k | tok_backup(tok, c); |
847 | 28.3k | } |
848 | 29.0k | p_start = tok->start; |
849 | 29.0k | p_end = tok->cur; |
850 | 29.0k | return MAKE_TOKEN(DOT); |
851 | 34.4k | } |
852 | | |
853 | | /* Number */ |
854 | 1.01M | if (Py_ISDIGIT(c)) { |
855 | 101k | if (c == '0') { |
856 | | /* Hex, octal or binary -- maybe. */ |
857 | 35.2k | c = tok_nextc(tok); |
858 | 35.2k | if (c == 'x' || c == 'X') { |
859 | | /* Hex */ |
860 | 15.8k | c = tok_nextc(tok); |
861 | 16.0k | do { |
862 | 16.0k | if (c == '_') { |
863 | 215 | c = tok_nextc(tok); |
864 | 215 | } |
865 | 16.0k | if (!Py_ISXDIGIT(c)) { |
866 | 21 | tok_backup(tok, c); |
867 | 21 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid hexadecimal literal")); |
868 | 21 | } |
869 | 78.8k | do { |
870 | 78.8k | c = tok_nextc(tok); |
871 | 78.8k | } while (Py_ISXDIGIT(c)); |
872 | 16.0k | } while (c == '_'); |
873 | 15.8k | if (!verify_end_of_number(tok, c, "hexadecimal")) { |
874 | 2 | return MAKE_TOKEN(ERRORTOKEN); |
875 | 2 | } |
876 | 15.8k | } |
877 | 19.3k | else if (c == 'o' || c == 'O') { |
878 | | /* Octal */ |
879 | 667 | c = tok_nextc(tok); |
880 | 1.19k | do { |
881 | 1.19k | if (c == '_') { |
882 | 529 | c = tok_nextc(tok); |
883 | 529 | } |
884 | 1.19k | if (c < '0' || c >= '8') { |
885 | 22 | if (Py_ISDIGIT(c)) { |
886 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
887 | 1 | "invalid digit '%c' in octal literal", c)); |
888 | 1 | } |
889 | 21 | else { |
890 | 21 | tok_backup(tok, c); |
891 | 21 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid octal literal")); |
892 | 21 | } |
893 | 22 | } |
894 | 3.26k | do { |
895 | 3.26k | c = tok_nextc(tok); |
896 | 3.26k | } while ('0' <= c && c < '8'); |
897 | 1.17k | } while (c == '_'); |
898 | 645 | if (Py_ISDIGIT(c)) { |
899 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
900 | 1 | "invalid digit '%c' in octal literal", c)); |
901 | 1 | } |
902 | 644 | if (!verify_end_of_number(tok, c, "octal")) { |
903 | 2 | return MAKE_TOKEN(ERRORTOKEN); |
904 | 2 | } |
905 | 644 | } |
906 | 18.7k | else if (c == 'b' || c == 'B') { |
907 | | /* Binary */ |
908 | 562 | c = tok_nextc(tok); |
909 | 896 | do { |
910 | 896 | if (c == '_') { |
911 | 342 | c = tok_nextc(tok); |
912 | 342 | } |
913 | 896 | if (c != '0' && c != '1') { |
914 | 19 | if (Py_ISDIGIT(c)) { |
915 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c)); |
916 | 1 | } |
917 | 18 | else { |
918 | 18 | tok_backup(tok, c); |
919 | 18 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid binary literal")); |
920 | 18 | } |
921 | 19 | } |
922 | 4.15k | do { |
923 | 4.15k | c = tok_nextc(tok); |
924 | 4.15k | } while (c == '0' || c == '1'); |
925 | 877 | } while (c == '_'); |
926 | 543 | if (Py_ISDIGIT(c)) { |
927 | 2 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c)); |
928 | 2 | } |
929 | 541 | if (!verify_end_of_number(tok, c, "binary")) { |
930 | 2 | return MAKE_TOKEN(ERRORTOKEN); |
931 | 2 | } |
932 | 541 | } |
933 | 18.1k | else { |
934 | 18.1k | int nonzero = 0; |
935 | | /* maybe old-style octal; c is first char of it */ |
936 | | /* in any case, allow '0' as a literal */ |
937 | 19.4k | while (1) { |
938 | 19.4k | if (c == '_') { |
939 | 93 | c = tok_nextc(tok); |
940 | 93 | if (!Py_ISDIGIT(c)) { |
941 | 3 | tok_backup(tok, c); |
942 | 3 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal")); |
943 | 3 | } |
944 | 93 | } |
945 | 19.4k | if (c != '0') { |
946 | 18.1k | break; |
947 | 18.1k | } |
948 | 1.31k | c = tok_nextc(tok); |
949 | 1.31k | } |
950 | 18.1k | char* zeros_end = tok->cur; |
951 | 18.1k | if (Py_ISDIGIT(c)) { |
952 | 613 | nonzero = 1; |
953 | 613 | c = tok_decimal_tail(tok); |
954 | 613 | if (c == 0) { |
955 | 2 | return MAKE_TOKEN(ERRORTOKEN); |
956 | 2 | } |
957 | 613 | } |
958 | 18.1k | if (c == '.') { |
959 | 875 | c = tok_nextc(tok); |
960 | 875 | goto fraction; |
961 | 875 | } |
962 | 17.2k | else if (c == 'e' || c == 'E') { |
963 | 842 | goto exponent; |
964 | 842 | } |
965 | 16.4k | else if (c == 'j' || c == 'J') { |
966 | 881 | goto imaginary; |
967 | 881 | } |
968 | 15.5k | else if (nonzero && !tok->tok_extra_tokens) { |
969 | | /* Old-style octal: now disallowed. */ |
970 | 21 | tok_backup(tok, c); |
971 | 21 | return MAKE_TOKEN(_PyTokenizer_syntaxerror_known_range( |
972 | 21 | tok, (int)(tok->start + 1 - tok->line_start), |
973 | 21 | (int)(zeros_end - tok->line_start), |
974 | 21 | "leading zeros in decimal integer " |
975 | 21 | "literals are not permitted; " |
976 | 21 | "use an 0o prefix for octal integers")); |
977 | 21 | } |
978 | 15.5k | if (!verify_end_of_number(tok, c, "decimal")) { |
979 | 28 | return MAKE_TOKEN(ERRORTOKEN); |
980 | 28 | } |
981 | 15.5k | } |
982 | 35.2k | } |
983 | 66.6k | else { |
984 | | /* Decimal */ |
985 | 66.6k | c = tok_decimal_tail(tok); |
986 | 66.6k | if (c == 0) { |
987 | 12 | return MAKE_TOKEN(ERRORTOKEN); |
988 | 12 | } |
989 | 66.6k | { |
990 | | /* Accept floating-point numbers. */ |
991 | 66.6k | if (c == '.') { |
992 | 3.96k | c = tok_nextc(tok); |
993 | 7.71k | fraction: |
994 | | /* Fraction */ |
995 | 7.71k | if (Py_ISDIGIT(c)) { |
996 | 5.77k | c = tok_decimal_tail(tok); |
997 | 5.77k | if (c == 0) { |
998 | 2 | return MAKE_TOKEN(ERRORTOKEN); |
999 | 2 | } |
1000 | 5.77k | } |
1001 | 7.71k | } |
1002 | 70.4k | if (c == 'e' || c == 'E') { |
1003 | 10.4k | int e; |
1004 | 11.2k | exponent: |
1005 | 11.2k | e = c; |
1006 | | /* Exponent part */ |
1007 | 11.2k | c = tok_nextc(tok); |
1008 | 11.2k | if (c == '+' || c == '-') { |
1009 | 4.03k | c = tok_nextc(tok); |
1010 | 4.03k | if (!Py_ISDIGIT(c)) { |
1011 | 13 | tok_backup(tok, c); |
1012 | 13 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal")); |
1013 | 13 | } |
1014 | 7.24k | } else if (!Py_ISDIGIT(c)) { |
1015 | 431 | tok_backup(tok, c); |
1016 | 431 | if (!verify_end_of_number(tok, e, "decimal")) { |
1017 | 42 | return MAKE_TOKEN(ERRORTOKEN); |
1018 | 42 | } |
1019 | 389 | tok_backup(tok, e); |
1020 | 389 | p_start = tok->start; |
1021 | 389 | p_end = tok->cur; |
1022 | 389 | return MAKE_TOKEN(NUMBER); |
1023 | 431 | } |
1024 | 10.8k | c = tok_decimal_tail(tok); |
1025 | 10.8k | if (c == 0) { |
1026 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
1027 | 1 | } |
1028 | 10.8k | } |
1029 | 70.8k | if (c == 'j' || c == 'J') { |
1030 | | /* Imaginary part */ |
1031 | 4.54k | imaginary: |
1032 | 4.54k | c = tok_nextc(tok); |
1033 | 4.54k | if (!verify_end_of_number(tok, c, "imaginary")) { |
1034 | 8 | return MAKE_TOKEN(ERRORTOKEN); |
1035 | 8 | } |
1036 | 4.54k | } |
1037 | 67.1k | else if (!verify_end_of_number(tok, c, "decimal")) { |
1038 | 117 | return MAKE_TOKEN(ERRORTOKEN); |
1039 | 117 | } |
1040 | 70.8k | } |
1041 | 70.8k | } |
1042 | 104k | tok_backup(tok, c); |
1043 | 104k | p_start = tok->start; |
1044 | 104k | p_end = tok->cur; |
1045 | 104k | return MAKE_TOKEN(NUMBER); |
1046 | 101k | } |
1047 | | |
1048 | 928k | f_string_quote: |
1049 | 928k | if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't') |
1050 | 928k | && (c == '\'' || c == '"'))) { |
1051 | | |
1052 | 17.2k | int quote = c; |
1053 | 17.2k | int quote_size = 1; /* 1 or 3 */ |
1054 | | |
1055 | | /* Nodes of type STRING, especially multi line strings |
1056 | | must be handled differently in order to get both |
1057 | | the starting line number and the column offset right. |
1058 | | (cf. issue 16806) */ |
1059 | 17.2k | tok->first_lineno = tok->lineno; |
1060 | 17.2k | tok->multi_line_start = tok->line_start; |
1061 | | |
1062 | | /* Find the quote size and start of string */ |
1063 | 17.2k | int after_quote = tok_nextc(tok); |
1064 | 17.2k | if (after_quote == quote) { |
1065 | 2.40k | int after_after_quote = tok_nextc(tok); |
1066 | 2.40k | if (after_after_quote == quote) { |
1067 | 809 | quote_size = 3; |
1068 | 809 | } |
1069 | 1.59k | else { |
1070 | | // TODO: Check this |
1071 | 1.59k | tok_backup(tok, after_after_quote); |
1072 | 1.59k | tok_backup(tok, after_quote); |
1073 | 1.59k | } |
1074 | 2.40k | } |
1075 | 17.2k | if (after_quote != quote) { |
1076 | 14.8k | tok_backup(tok, after_quote); |
1077 | 14.8k | } |
1078 | | |
1079 | | |
1080 | 17.2k | p_start = tok->start; |
1081 | 17.2k | p_end = tok->cur; |
1082 | 17.2k | if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) { |
1083 | 2 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested f-strings or t-strings")); |
1084 | 2 | } |
1085 | 17.2k | tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok); |
1086 | 17.2k | the_current_tok->kind = TOK_FSTRING_MODE; |
1087 | 17.2k | the_current_tok->quote = quote; |
1088 | 17.2k | the_current_tok->quote_size = quote_size; |
1089 | 17.2k | the_current_tok->start = tok->start; |
1090 | 17.2k | the_current_tok->multi_line_start = tok->line_start; |
1091 | 17.2k | the_current_tok->first_line = tok->lineno; |
1092 | 17.2k | the_current_tok->start_offset = -1; |
1093 | 17.2k | the_current_tok->multi_line_start_offset = -1; |
1094 | 17.2k | the_current_tok->last_expr_buffer = NULL; |
1095 | 17.2k | the_current_tok->last_expr_size = 0; |
1096 | 17.2k | the_current_tok->last_expr_end = -1; |
1097 | 17.2k | the_current_tok->in_format_spec = 0; |
1098 | 17.2k | the_current_tok->in_debug = 0; |
1099 | | |
1100 | 17.2k | enum string_kind_t string_kind = FSTRING; |
1101 | 17.2k | switch (*tok->start) { |
1102 | 804 | case 'T': |
1103 | 4.61k | case 't': |
1104 | 4.61k | the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r'; |
1105 | 4.61k | string_kind = TSTRING; |
1106 | 4.61k | break; |
1107 | 1.57k | case 'F': |
1108 | 12.1k | case 'f': |
1109 | 12.1k | the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r'; |
1110 | 12.1k | break; |
1111 | 109 | case 'R': |
1112 | 512 | case 'r': |
1113 | 512 | the_current_tok->raw = 1; |
1114 | 512 | if (Py_TOLOWER(*(tok->start + 1)) == 't') { |
1115 | 204 | string_kind = TSTRING; |
1116 | 204 | } |
1117 | 512 | break; |
1118 | 0 | default: |
1119 | 0 | Py_UNREACHABLE(); |
1120 | 17.2k | } |
1121 | | |
1122 | 17.2k | the_current_tok->string_kind = string_kind; |
1123 | 17.2k | the_current_tok->curly_bracket_depth = 0; |
1124 | 17.2k | the_current_tok->curly_bracket_expr_start_depth = -1; |
1125 | 17.2k | return string_kind == TSTRING ? MAKE_TOKEN(TSTRING_START) : MAKE_TOKEN(FSTRING_START); |
1126 | 17.2k | } |
1127 | | |
1128 | 916k | letter_quote: |
1129 | | /* String */ |
1130 | 916k | if (c == '\'' || c == '"') { |
1131 | 59.9k | int quote = c; |
1132 | 59.9k | int quote_size = 1; /* 1 or 3 */ |
1133 | 59.9k | int end_quote_size = 0; |
1134 | 59.9k | int has_escaped_quote = 0; |
1135 | | |
1136 | | /* Nodes of type STRING, especially multi line strings |
1137 | | must be handled differently in order to get both |
1138 | | the starting line number and the column offset right. |
1139 | | (cf. issue 16806) */ |
1140 | 59.9k | tok->first_lineno = tok->lineno; |
1141 | 59.9k | tok->multi_line_start = tok->line_start; |
1142 | | |
1143 | | /* Find the quote size and start of string */ |
1144 | 59.9k | c = tok_nextc(tok); |
1145 | 59.9k | if (c == quote) { |
1146 | 10.8k | c = tok_nextc(tok); |
1147 | 10.8k | if (c == quote) { |
1148 | 2.54k | quote_size = 3; |
1149 | 2.54k | } |
1150 | 8.34k | else { |
1151 | 8.34k | end_quote_size = 1; /* empty string found */ |
1152 | 8.34k | } |
1153 | 10.8k | } |
1154 | 59.9k | if (c != quote) { |
1155 | 57.3k | tok_backup(tok, c); |
1156 | 57.3k | } |
1157 | | |
1158 | | /* Get rest of string */ |
1159 | 1.18M | while (end_quote_size != quote_size) { |
1160 | 1.12M | c = tok_nextc(tok); |
1161 | 1.12M | if (tok->done == E_ERROR) { |
1162 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1163 | 0 | } |
1164 | 1.12M | if (tok->done == E_DECODE) { |
1165 | 0 | break; |
1166 | 0 | } |
1167 | 1.12M | if (c == EOF || (quote_size == 1 && c == '\n')) { |
1168 | 423 | assert(tok->multi_line_start != NULL); |
1169 | | // shift the tok_state's location into |
1170 | | // the start of string, and report the error |
1171 | | // from the initial quote character |
1172 | 423 | tok->cur = (char *)tok->start; |
1173 | 423 | tok->cur++; |
1174 | 423 | tok->line_start = tok->multi_line_start; |
1175 | 423 | int start = tok->lineno; |
1176 | 423 | tok->lineno = tok->first_lineno; |
1177 | | |
1178 | 423 | if (INSIDE_FSTRING(tok)) { |
1179 | | /* When we are in an f-string, before raising the |
1180 | | * unterminated string literal error, check whether |
1181 | | * does the initial quote matches with f-strings quotes |
1182 | | * and if it is, then this must be a missing '}' token |
1183 | | * so raise the proper error */ |
1184 | 27 | tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); |
1185 | 27 | if (the_current_tok->quote == quote && |
1186 | 27 | the_current_tok->quote_size == quote_size) { |
1187 | 19 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1188 | 19 | "%c-string: expecting '}'", TOK_GET_STRING_PREFIX(tok))); |
1189 | 19 | } |
1190 | 27 | } |
1191 | | |
1192 | 404 | if (quote_size == 3) { |
1193 | 17 | _PyTokenizer_syntaxerror(tok, "unterminated triple-quoted string literal" |
1194 | 17 | " (detected at line %d)", start); |
1195 | 17 | if (c != '\n') { |
1196 | 17 | tok->done = E_EOFS; |
1197 | 17 | } |
1198 | 17 | return MAKE_TOKEN(ERRORTOKEN); |
1199 | 17 | } |
1200 | 387 | else { |
1201 | 387 | if (has_escaped_quote) { |
1202 | 10 | _PyTokenizer_syntaxerror( |
1203 | 10 | tok, |
1204 | 10 | "unterminated string literal (detected at line %d); " |
1205 | 10 | "perhaps you escaped the end quote?", |
1206 | 10 | start |
1207 | 10 | ); |
1208 | 377 | } else { |
1209 | 377 | _PyTokenizer_syntaxerror( |
1210 | 377 | tok, "unterminated string literal (detected at line %d)", start |
1211 | 377 | ); |
1212 | 377 | } |
1213 | 387 | if (c != '\n') { |
1214 | 15 | tok->done = E_EOLS; |
1215 | 15 | } |
1216 | 387 | return MAKE_TOKEN(ERRORTOKEN); |
1217 | 387 | } |
1218 | 404 | } |
1219 | 1.12M | if (c == quote) { |
1220 | 58.0k | end_quote_size += 1; |
1221 | 58.0k | } |
1222 | 1.06M | else { |
1223 | 1.06M | end_quote_size = 0; |
1224 | 1.06M | if (c == '\\') { |
1225 | 29.7k | c = tok_nextc(tok); /* skip escaped char */ |
1226 | 29.7k | if (c == quote) { /* but record whether the escaped char was a quote */ |
1227 | 1.36k | has_escaped_quote = 1; |
1228 | 1.36k | } |
1229 | 29.7k | if (c == '\r') { |
1230 | 221 | c = tok_nextc(tok); |
1231 | 221 | } |
1232 | 29.7k | } |
1233 | 1.06M | } |
1234 | 1.12M | } |
1235 | | |
1236 | 59.4k | p_start = tok->start; |
1237 | 59.4k | p_end = tok->cur; |
1238 | 59.4k | return MAKE_TOKEN(STRING); |
1239 | 59.9k | } |
1240 | | |
1241 | | /* Line continuation */ |
1242 | 856k | if (c == '\\') { |
1243 | 435 | if ((c = tok_continuation_line(tok)) == -1) { |
1244 | 70 | return MAKE_TOKEN(ERRORTOKEN); |
1245 | 70 | } |
1246 | 365 | tok->cont_line = 1; |
1247 | 365 | goto again; /* Read next line */ |
1248 | 435 | } |
1249 | | |
1250 | | /* Punctuation character */ |
1251 | 856k | int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{'); |
1252 | 856k | if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) { |
1253 | | /* This code block gets executed before curly_bracket_depth is incremented
1254 | | * by the `{` case, so to check that we are on the 0th level we need
1255 | | * to adjust it manually */
1256 | 54.4k | int cursor = current_tok->curly_bracket_depth - (c != '{'); |
1257 | 54.4k | int in_format_spec = current_tok->in_format_spec; |
1258 | 54.4k | int cursor_in_format_with_debug = |
1259 | 54.4k | cursor == 1 && (current_tok->in_debug || in_format_spec); |
1260 | 54.4k | int cursor_valid = cursor == 0 || cursor_in_format_with_debug; |
1261 | 54.4k | if ((cursor_valid) && !_PyLexer_update_ftstring_expr(tok, c)) { |
1262 | 0 | return MAKE_TOKEN(ENDMARKER); |
1263 | 0 | } |
1264 | 54.4k | if ((cursor_valid) && c != '{' && set_ftstring_expr(tok, token, c)) { |
1265 | 10 | return MAKE_TOKEN(ERRORTOKEN); |
1266 | 10 | } |
1267 | | |
1268 | 54.3k | if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) { |
1269 | 3.99k | current_tok->kind = TOK_FSTRING_MODE; |
1270 | 3.99k | current_tok->in_format_spec = 1; |
1271 | 3.99k | p_start = tok->start; |
1272 | 3.99k | p_end = tok->cur; |
1273 | 3.99k | return MAKE_TOKEN(_PyToken_OneChar(c)); |
1274 | 3.99k | } |
1275 | 54.3k | } |
1276 | | |
1277 | | /* Check for two-character token */ |
1278 | 852k | { |
1279 | 852k | int c2 = tok_nextc(tok); |
1280 | 852k | int current_token = _PyToken_TwoChars(c, c2); |
1281 | 852k | if (current_token != OP) { |
1282 | 22.9k | int c3 = tok_nextc(tok); |
1283 | 22.9k | int current_token3 = _PyToken_ThreeChars(c, c2, c3); |
1284 | 22.9k | if (current_token3 != OP) { |
1285 | 992 | current_token = current_token3; |
1286 | 992 | } |
1287 | 21.9k | else { |
1288 | 21.9k | tok_backup(tok, c3); |
1289 | 21.9k | } |
1290 | 22.9k | p_start = tok->start; |
1291 | 22.9k | p_end = tok->cur; |
1292 | 22.9k | return MAKE_TOKEN(current_token); |
1293 | 22.9k | } |
1294 | 829k | tok_backup(tok, c2); |
1295 | 829k | } |
1296 | | |
1297 | | /* Keep track of parentheses nesting level */ |
1298 | 0 | switch (c) { |
1299 | 90.2k | case '(': |
1300 | 126k | case '[': |
1301 | 174k | case '{': |
1302 | 174k | if (tok->level >= MAXLEVEL) { |
1303 | 3 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested parentheses")); |
1304 | 3 | } |
1305 | 174k | tok->parenstack[tok->level] = c; |
1306 | 174k | tok->parenlinenostack[tok->level] = tok->lineno; |
1307 | 174k | tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start); |
1308 | 174k | tok->level++; |
1309 | 174k | if (INSIDE_FSTRING(tok)) { |
1310 | 30.2k | current_tok->curly_bracket_depth++; |
1311 | 30.2k | } |
1312 | 174k | break; |
1313 | 61.6k | case ')': |
1314 | 73.1k | case ']': |
1315 | 99.3k | case '}': |
1316 | 99.3k | if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') { |
1317 | 56 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1318 | 56 | "%c-string: single '}' is not allowed", TOK_GET_STRING_PREFIX(tok))); |
1319 | 56 | } |
1320 | 99.3k | if (!tok->tok_extra_tokens && !tok->level) { |
1321 | 225 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "unmatched '%c'", c)); |
1322 | 225 | } |
1323 | 99.0k | if (tok->level > 0) { |
1324 | 99.0k | tok->level--; |
1325 | 99.0k | int opening = tok->parenstack[tok->level]; |
1326 | 99.0k | if (!tok->tok_extra_tokens && !((opening == '(' && c == ')') || |
1327 | 99.0k | (opening == '[' && c == ']') || |
1328 | 99.0k | (opening == '{' && c == '}'))) { |
1329 | | /* If the opening bracket belongs to an f-string's expression |
1330 | | part (e.g. f"{)}") and the closing bracket is an arbitrary |
1331 | | nested expression, then instead of matching a different |
1332 | | syntactical construct with it, we'll throw an unmatched
1333 | | parentheses error. */ |
1334 | 54 | if (INSIDE_FSTRING(tok) && opening == '{') { |
1335 | 11 | assert(current_tok->curly_bracket_depth >= 0); |
1336 | 11 | int previous_bracket = current_tok->curly_bracket_depth - 1; |
1337 | 11 | if (previous_bracket == current_tok->curly_bracket_expr_start_depth) { |
1338 | 6 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1339 | 6 | "%c-string: unmatched '%c'", TOK_GET_STRING_PREFIX(tok), c)); |
1340 | 6 | } |
1341 | 11 | } |
1342 | 48 | if (tok->parenlinenostack[tok->level] != tok->lineno) { |
1343 | 2 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1344 | 2 | "closing parenthesis '%c' does not match " |
1345 | 2 | "opening parenthesis '%c' on line %d", |
1346 | 2 | c, opening, tok->parenlinenostack[tok->level])); |
1347 | 2 | } |
1348 | 46 | else { |
1349 | 46 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1350 | 46 | "closing parenthesis '%c' does not match " |
1351 | 46 | "opening parenthesis '%c'", |
1352 | 46 | c, opening)); |
1353 | 46 | } |
1354 | 48 | } |
1355 | 99.0k | } |
1356 | | |
1357 | 99.0k | if (INSIDE_FSTRING(tok)) { |
1358 | 22.2k | current_tok->curly_bracket_depth--; |
1359 | 22.2k | if (current_tok->curly_bracket_depth < 0) { |
1360 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "%c-string: unmatched '%c'", |
1361 | 1 | TOK_GET_STRING_PREFIX(tok), c)); |
1362 | 1 | } |
1363 | 22.2k | if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) { |
1364 | 20.4k | current_tok->curly_bracket_expr_start_depth--; |
1365 | 20.4k | current_tok->kind = TOK_FSTRING_MODE; |
1366 | 20.4k | current_tok->in_format_spec = 0; |
1367 | 20.4k | current_tok->in_debug = 0; |
1368 | 20.4k | } |
1369 | 22.2k | } |
1370 | 99.0k | break; |
1371 | 555k | default: |
1372 | 555k | break; |
1373 | 829k | } |
1374 | | |
1375 | 829k | if (!Py_UNICODE_ISPRINTABLE(c)) { |
1376 | 495 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", c)); |
1377 | 495 | } |
1378 | | |
1379 | 828k | if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) { |
1380 | 42.6k | current_tok->in_debug = 1; |
1381 | 42.6k | } |
1382 | | |
1383 | | /* Punctuation character */ |
1384 | 828k | p_start = tok->start; |
1385 | 828k | p_end = tok->cur; |
1386 | 828k | return MAKE_TOKEN(_PyToken_OneChar(c)); |
1387 | 829k | } |
1388 | | |
1389 | | static int |
1390 | | tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) |
1391 | 53.0k | { |
1392 | 53.0k | const char *p_start = NULL; |
1393 | 53.0k | const char *p_end = NULL; |
1394 | 53.0k | int end_quote_size = 0; |
1395 | 53.0k | int unicode_escape = 0; |
1396 | | |
1397 | 53.0k | tok->start = tok->cur; |
1398 | 53.0k | tok->first_lineno = tok->lineno; |
1399 | 53.0k | tok->starting_col_offset = tok->col_offset; |
1400 | | |
1401 | | // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize |
1402 | | // before it. |
1403 | 53.0k | int start_char = tok_nextc(tok); |
1404 | 53.0k | if (start_char == '{') { |
1405 | 14.2k | int peek1 = tok_nextc(tok); |
1406 | 14.2k | tok_backup(tok, peek1); |
1407 | 14.2k | tok_backup(tok, start_char); |
1408 | 14.2k | if (peek1 != '{') { |
1409 | 11.3k | current_tok->curly_bracket_expr_start_depth++; |
1410 | 11.3k | if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { |
1411 | 7 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1412 | 7 | "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok))); |
1413 | 7 | } |
1414 | 11.3k | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1415 | 11.3k | return tok_get_normal_mode(tok, current_tok, token); |
1416 | 11.3k | } |
1417 | 14.2k | } |
1418 | 38.7k | else { |
1419 | 38.7k | tok_backup(tok, start_char); |
1420 | 38.7k | } |
1421 | | |
1422 | | // Check if we are at the end of the string |
1423 | 60.0k | for (int i = 0; i < current_tok->quote_size; i++) { |
1424 | 47.6k | int quote = tok_nextc(tok); |
1425 | 47.6k | if (quote != current_tok->quote) { |
1426 | 29.2k | tok_backup(tok, quote); |
1427 | 29.2k | goto f_string_middle; |
1428 | 29.2k | } |
1429 | 47.6k | } |
1430 | | |
1431 | 12.3k | if (current_tok->last_expr_buffer != NULL) { |
1432 | 7.36k | PyMem_Free(current_tok->last_expr_buffer); |
1433 | 7.36k | current_tok->last_expr_buffer = NULL; |
1434 | 7.36k | current_tok->last_expr_size = 0; |
1435 | 7.36k | current_tok->last_expr_end = -1; |
1436 | 7.36k | } |
1437 | | |
1438 | 12.3k | p_start = tok->start; |
1439 | 12.3k | p_end = tok->cur; |
1440 | 12.3k | tok->tok_mode_stack_index--; |
1441 | 12.3k | return MAKE_TOKEN(FTSTRING_END(current_tok)); |
1442 | | |
1443 | 29.2k | f_string_middle: |
1444 | | |
1445 | | // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle |
1446 | | // this. |
1447 | 29.2k | tok->multi_line_start = tok->line_start; |
1448 | 158k | while (end_quote_size != current_tok->quote_size) { |
1449 | 152k | int c = tok_nextc(tok); |
1450 | 152k | if (tok->done == E_ERROR || tok->done == E_DECODE) { |
1451 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1452 | 0 | } |
1453 | 152k | int in_format_spec = ( |
1454 | 152k | current_tok->in_format_spec |
1455 | 152k | && |
1456 | 152k | INSIDE_FSTRING_EXPR(current_tok) |
1457 | 152k | ); |
1458 | | |
1459 | 152k | if (c == EOF || (current_tok->quote_size == 1 && c == '\n')) { |
1460 | 475 | if (tok->decoding_erred) { |
1461 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1462 | 0 | } |
1463 | | |
1464 | | // If we are in a format spec and we found a newline, |
1465 | | // it means that the format spec ends here and we should |
1466 | | // return to the regular mode. |
1467 | 475 | if (in_format_spec && c == '\n') { |
1468 | 85 | if (current_tok->quote_size == 1) { |
1469 | 85 | return MAKE_TOKEN( |
1470 | 85 | _PyTokenizer_syntaxerror( |
1471 | 85 | tok, |
1472 | 85 | "%c-string: newlines are not allowed in format specifiers for single quoted %c-strings", |
1473 | 85 | TOK_GET_STRING_PREFIX(tok), TOK_GET_STRING_PREFIX(tok) |
1474 | 85 | ) |
1475 | 85 | ); |
1476 | 85 | } |
1477 | 0 | tok_backup(tok, c); |
1478 | 0 | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1479 | 0 | current_tok->in_format_spec = 0; |
1480 | 0 | p_start = tok->start; |
1481 | 0 | p_end = tok->cur; |
1482 | 0 | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1483 | 85 | } |
1484 | | |
1485 | 390 | assert(tok->multi_line_start != NULL); |
1486 | | // shift the tok_state's location into |
1487 | | // the start of string, and report the error |
1488 | | // from the initial quote character |
1489 | 390 | tok->cur = (char *)current_tok->start; |
1490 | 390 | tok->cur++; |
1491 | 390 | tok->line_start = current_tok->multi_line_start; |
1492 | 390 | int start = tok->lineno; |
1493 | | |
1494 | 390 | tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); |
1495 | 390 | tok->lineno = the_current_tok->first_line; |
1496 | | |
1497 | 390 | if (current_tok->quote_size == 3) { |
1498 | 35 | _PyTokenizer_syntaxerror(tok, |
1499 | 35 | "unterminated triple-quoted %c-string literal" |
1500 | 35 | " (detected at line %d)", |
1501 | 35 | TOK_GET_STRING_PREFIX(tok), start); |
1502 | 35 | if (c != '\n') { |
1503 | 35 | tok->done = E_EOFS; |
1504 | 35 | } |
1505 | 35 | return MAKE_TOKEN(ERRORTOKEN); |
1506 | 35 | } |
1507 | 355 | else { |
1508 | 355 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1509 | 355 | "unterminated %c-string literal (detected at" |
1510 | 355 | " line %d)", TOK_GET_STRING_PREFIX(tok), start)); |
1511 | 355 | } |
1512 | 390 | } |
1513 | | |
1514 | 152k | if (c == current_tok->quote) { |
1515 | 8.55k | end_quote_size += 1; |
1516 | 8.55k | continue; |
1517 | 143k | } else { |
1518 | 143k | end_quote_size = 0; |
1519 | 143k | } |
1520 | | |
1521 | 143k | if (c == '{') { |
1522 | 18.1k | if (!_PyLexer_update_ftstring_expr(tok, c)) { |
1523 | 0 | return MAKE_TOKEN(ENDMARKER); |
1524 | 0 | } |
1525 | 18.1k | int peek = tok_nextc(tok); |
1526 | 18.1k | if (peek != '{' || in_format_spec) { |
1527 | 14.7k | tok_backup(tok, peek); |
1528 | 14.7k | tok_backup(tok, c); |
1529 | 14.7k | current_tok->curly_bracket_expr_start_depth++; |
1530 | 14.7k | if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { |
1531 | 5 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1532 | 5 | "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok))); |
1533 | 5 | } |
1534 | 14.7k | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1535 | 14.7k | current_tok->in_format_spec = 0; |
1536 | 14.7k | p_start = tok->start; |
1537 | 14.7k | p_end = tok->cur; |
1538 | 14.7k | } else { |
1539 | 3.40k | p_start = tok->start; |
1540 | 3.40k | p_end = tok->cur - 1; |
1541 | 3.40k | } |
1542 | 18.1k | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1543 | 125k | } else if (c == '}') { |
1544 | 4.91k | if (unicode_escape) { |
1545 | 486 | p_start = tok->start; |
1546 | 486 | p_end = tok->cur; |
1547 | 486 | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1548 | 486 | } |
1549 | 4.42k | int peek = tok_nextc(tok); |
1550 | | |
1551 | | // The tokenizer can only be in the format spec if we have already completed the expression |
1552 | | // scanning (indicated by the end of the expression being set) and we are not at the top level |
1553 | | // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double |
1554 | | // brackets, we can bypass it here. |
1555 | 4.42k | int cursor = current_tok->curly_bracket_depth; |
1556 | 4.42k | if (peek == '}' && !in_format_spec && cursor == 0) { |
1557 | 1.74k | p_start = tok->start; |
1558 | 1.74k | p_end = tok->cur - 1; |
1559 | 2.67k | } else { |
1560 | 2.67k | tok_backup(tok, peek); |
1561 | 2.67k | tok_backup(tok, c); |
1562 | 2.67k | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1563 | 2.67k | current_tok->in_format_spec = 0; |
1564 | 2.67k | p_start = tok->start; |
1565 | 2.67k | p_end = tok->cur; |
1566 | 2.67k | } |
1567 | 4.42k | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1568 | 120k | } else if (c == '\\') { |
1569 | 6.30k | int peek = tok_nextc(tok); |
1570 | 6.30k | if (peek == '\r') { |
1571 | 69 | peek = tok_nextc(tok); |
1572 | 69 | } |
1573 | | // Special case when the backslash is right before a curly |
1574 | | // brace. We have to restore and return the control back |
1575 | | // to the loop for the next iteration. |
1576 | 6.30k | if (peek == '{' || peek == '}') { |
1577 | 1.32k | if (!current_tok->raw) { |
1578 | 1.13k | if (_PyTokenizer_warn_invalid_escape_sequence(tok, peek)) { |
1579 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1580 | 0 | } |
1581 | 1.13k | } |
1582 | 1.32k | tok_backup(tok, peek); |
1583 | 1.32k | continue; |
1584 | 1.32k | } |
1585 | | |
1586 | 4.97k | if (!current_tok->raw) { |
1587 | 4.71k | if (peek == 'N') { |
1588 | | /* Handle named unicode escapes (\N{BULLET}) */ |
1589 | 750 | peek = tok_nextc(tok); |
1590 | 750 | if (peek == '{') { |
1591 | 506 | unicode_escape = 1; |
1592 | 506 | } else { |
1593 | 244 | tok_backup(tok, peek); |
1594 | 244 | } |
1595 | 750 | } |
1596 | 4.71k | } /* else { |
1597 | | skip the escaped character |
1598 | | }*/ |
1599 | 4.97k | } |
1600 | 143k | } |
1601 | | |
1602 | | // Backup the f-string quotes to emit a final FSTRING_MIDDLE and |
1603 | | // add the quotes to the FSTRING_END in the next tokenizer iteration. |
1604 | 12.3k | for (int i = 0; i < current_tok->quote_size; i++) { |
1605 | 6.60k | tok_backup(tok, current_tok->quote); |
1606 | 6.60k | } |
1607 | 5.72k | p_start = tok->start; |
1608 | 5.72k | p_end = tok->cur; |
1609 | 5.72k | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1610 | 29.2k | } |
1611 | | |
1612 | | static int |
1613 | | tok_get(struct tok_state *tok, struct token *token) |
1614 | 1.79M | { |
1615 | 1.79M | tokenizer_mode *current_tok = TOK_GET_MODE(tok); |
1616 | 1.79M | if (current_tok->kind == TOK_REGULAR_MODE) { |
1617 | 1.74M | return tok_get_normal_mode(tok, current_tok, token); |
1618 | 1.74M | } else { |
1619 | 53.0k | return tok_get_fstring_mode(tok, current_tok, token); |
1620 | 53.0k | } |
1621 | 1.79M | } |
1622 | | |
1623 | | int |
1624 | | _PyTokenizer_Get(struct tok_state *tok, struct token *token) |
1625 | 1.79M | { |
1626 | 1.79M | int result = tok_get(tok, token); |
1627 | 1.79M | if (tok->decoding_erred) { |
1628 | 0 | result = ERRORTOKEN; |
1629 | 0 | tok->done = E_DECODE; |
1630 | 0 | } |
1631 | 1.79M | return result; |
1632 | 1.79M | } |
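
    A typical caller drains tokens until ENDMARKER or an error, roughly as
    sketched below (token setup/cleanup elided; the real drivers live in
    Parser/, and this helper name is hypothetical):

        static int drain_tokens(struct tok_state *tok) {
            struct token t;
            for (;;) {
                int type = _PyTokenizer_Get(tok, &t);
                if (type == ERRORTOKEN) {
                    return -1;             /* tok->done holds the E_* code */
                }
                /* ... hand the token to the parser ... */
                if (type == ENDMARKER) {
                    return 0;
                }
            }
        }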