/src/cpython/Parser/lexer/lexer.c
Line | Count | Source |
1 | | #include "Python.h" |
2 | | #include "pycore_token.h" |
3 | | #include "pycore_unicodeobject.h" |
4 | | #include "errcode.h" |
5 | | |
6 | | #include "state.h" |
7 | | #include "../tokenizer/helpers.h" |
8 | | |
9 | | /* Alternate tab spacing */ |
10 | 1.70k | #define ALTTABSIZE 1 |
11 | | |
12 | 1.67M | #define is_potential_identifier_start(c) (\ |
13 | 1.67M | (c >= 'a' && c <= 'z')\ |
14 | 1.67M | || (c >= 'A' && c <= 'Z')\ |
15 | 1.67M | || c == '_'\ |
16 | 1.67M | || (c >= 128)) |
17 | | |
18 | 2.23M | #define is_potential_identifier_char(c) (\ |
19 | 2.23M | (c >= 'a' && c <= 'z')\ |
20 | 2.23M | || (c >= 'A' && c <= 'Z')\ |
21 | 2.23M | || (c >= '0' && c <= '9')\ |
22 | 2.23M | || c == '_'\ |
23 | 2.23M | || (c >= 128)) |
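
These two predicates are deliberately ASCII-only fast paths: any byte >= 128 is provisionally accepted as an identifier character and only verified against PEP 3131 later, in verify_identifier() below. A minimal standalone sketch of the classification (macro bodies copied from above; the test driver is illustrative only):

#include <stdio.h>

#define is_potential_identifier_start(c) (\
    (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c >= 128))
#define is_potential_identifier_char(c) (\
    (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') \
    || (c >= '0' && c <= '9') || c == '_' || (c >= 128))

int main(void)
{
    /* '9' may continue an identifier but not start one; 0xC3, a UTF-8
       lead byte, is provisionally accepted in both positions. */
    printf("%d %d\n", is_potential_identifier_start('9'),
                      is_potential_identifier_char('9'));   /* prints: 0 1 */
    printf("%d %d\n", is_potential_identifier_start(0xC3),
                      is_potential_identifier_char(0xC3));  /* prints: 1 1 */
    return 0;
}
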
24 | | |
25 | | #ifdef Py_DEBUG |
26 | | static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) { |
27 | | assert(tok->tok_mode_stack_index >= 0); |
28 | | assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL); |
29 | | return &(tok->tok_mode_stack[tok->tok_mode_stack_index]); |
30 | | } |
31 | | static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) { |
32 | | assert(tok->tok_mode_stack_index >= 0); |
33 | | assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL); |
34 | | return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]); |
35 | | } |
36 | | #else |
37 | 1.78M | #define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index])) |
38 | 14.7k | #define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index])) |
39 | | #endif |
40 | | |
41 | | #define FTSTRING_MIDDLE(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_MIDDLE : FSTRING_MIDDLE) |
42 | | #define FTSTRING_END(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_END : FSTRING_END) |
43 | 28 | #define TOK_GET_STRING_PREFIX(tok) (TOK_GET_MODE(tok)->string_kind == TSTRING ? 't' : 'f') |
44 | 1.68M | #define MAKE_TOKEN(token_type) _PyLexer_token_setup(tok, token, token_type, p_start, p_end) |
45 | 0 | #define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\ |
46 | 0 | _PyLexer_type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) |
47 | | |
48 | | /* Spaces in this constant are treated as "zero or more spaces or tabs" when |
49 | | tokenizing. */ |
50 | | static const char* type_comment_prefix = "# type: "; |
51 | | |
52 | | static inline int |
53 | | contains_null_bytes(const char* str, size_t size) |
54 | 211k | { |
55 | 211k | return memchr(str, 0, size) != NULL; |
56 | 211k | } |
57 | | |
58 | | /* Get next char, updating state; error code goes into tok->done */ |
59 | | static int |
60 | | tok_nextc(struct tok_state *tok) |
61 | 10.3M | { |
62 | 10.3M | int rc; |
63 | 10.5M | for (;;) { |
64 | 10.5M | if (tok->cur != tok->inp) { |
65 | 10.2M | if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) { |
66 | 0 | tok->done = E_COLUMNOVERFLOW; |
67 | 0 | return EOF; |
68 | 0 | } |
69 | 10.2M | tok->col_offset++; |
70 | 10.2M | return Py_CHARMASK(*tok->cur++); /* Fast path */ |
71 | 10.2M | } |
72 | 257k | if (tok->done != E_OK) { |
73 | 30.0k | return EOF; |
74 | 30.0k | } |
75 | 227k | rc = tok->underflow(tok); |
76 | | #if defined(Py_DEBUG) |
77 | | if (tok->debug) { |
78 | | fprintf(stderr, "line[%d] = ", tok->lineno); |
79 | | _PyTokenizer_print_escape(stderr, tok->cur, tok->inp - tok->cur); |
80 | | fprintf(stderr, " tok->done = %d\n", tok->done); |
81 | | } |
82 | | #endif |
83 | 227k | if (!rc) { |
84 | 15.1k | tok->cur = tok->inp; |
85 | 15.1k | return EOF; |
86 | 15.1k | } |
87 | 211k | tok->line_start = tok->cur; |
88 | | |
89 | 211k | if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) { |
90 | 0 | _PyTokenizer_syntaxerror(tok, "source code cannot contain null bytes"); |
91 | 0 | tok->cur = tok->inp; |
92 | 0 | return EOF; |
93 | 0 | } |
94 | 211k | } |
95 | 10.3M | Py_UNREACHABLE(); |
96 | 10.3M | } |
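
tok_nextc() is a buffered reader in the classic shape: the fast path serves bytes from the [cur, inp) window, and the slow path asks the underflow hook to refill before retrying. A reduced sketch of that shape, assuming a hypothetical refill-from-FILE callback in place of tok->underflow:

#include <stdio.h>

/* Hypothetical reduced state: a window [cur, inp) over a refillable buffer. */
struct reader {
    char buf[256];
    char *cur, *inp;               /* next byte to serve / end of valid data */
    FILE *fp;
};

static int refill(struct reader *r)    /* plays the role of tok->underflow */
{
    size_t n = fread(r->buf, 1, sizeof r->buf, r->fp);
    r->cur = r->buf;
    r->inp = r->buf + n;
    return n > 0;
}

static int next_char(struct reader *r)
{
    for (;;) {
        if (r->cur != r->inp) {
            return (unsigned char)*r->cur++;   /* fast path */
        }
        if (!refill(r)) {
            return EOF;                        /* refill failed: end of input */
        }
    }
}
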
97 | | |
98 | | /* Back-up one character */ |
99 | | static void |
100 | | tok_backup(struct tok_state *tok, int c) |
101 | 3.53M | { |
102 | 3.53M | if (c != EOF) { |
103 | 3.50M | if (--tok->cur < tok->buf) { |
104 | 0 | Py_FatalError("tokenizer beginning of buffer"); |
105 | 0 | } |
106 | 3.50M | if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) { |
107 | 0 | Py_FatalError("tok_backup: wrong character"); |
108 | 0 | } |
109 | 3.50M | tok->col_offset--; |
110 | 3.50M | } |
111 | 3.53M | } |
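
Paired with tok_nextc(), tok_backup() gives the lexer exactly one character of pushback, which is what the many read-inspect-backup sequences below rely on. A self-contained sketch of the resulting peek pattern over a flat string (all names hypothetical):

/* Hypothetical single-buffer cursor with one character of pushback. */
struct cursor { const char *buf, *cur; };

static int cur_next(struct cursor *c)
{
    return *c->cur ? (unsigned char)*c->cur++ : -1;   /* -1 plays EOF */
}

static void cur_backup(struct cursor *c, int ch)
{
    if (ch != -1) {
        --c->cur;   /* must undo exactly the byte that was just read */
    }
}

/* Peek: look at the next character without consuming it. */
static int cur_peek(struct cursor *c)
{
    int ch = cur_next(c);
    cur_backup(c, ch);
    return ch;
}
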
112 | | |
113 | | static int |
114 | 18.5k | set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { |
115 | 18.5k | assert(token != NULL); |
116 | 18.5k | assert(c == '}' || c == ':' || c == '!'); |
117 | 18.5k | tokenizer_mode *tok_mode = TOK_GET_MODE(tok); |
118 | | |
119 | 18.5k | if (!(tok_mode->in_debug || tok_mode->string_kind == TSTRING) || token->metadata) { |
120 | 11.1k | return 0; |
121 | 11.1k | } |
122 | 7.44k | PyObject *res = NULL; |
123 | | |
124 | | // Look for a # character outside of string literals |
125 | 7.44k | int hash_detected = 0; |
126 | 7.44k | int in_string = 0; |
127 | 7.44k | char quote_char = 0; |
128 | | |
129 | 1.86M | for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) { |
130 | 1.85M | char ch = tok_mode->last_expr_buffer[i]; |
131 | | |
132 | | // Skip escaped characters |
133 | 1.85M | if (ch == '\\') { |
134 | 44.6k | i++; |
135 | 44.6k | continue; |
136 | 44.6k | } |
137 | | |
138 | | // Handle quotes |
139 | 1.80M | if (ch == '"' || ch == '\'') { |
140 | | // The following if/else block works because there is an odd number |
141 | | // of quotes in STRING tokens and the lexer only ever reaches this |
142 | | // function with valid STRING tokens. |
143 | | // For example: """hello""" |
144 | | // First quote: in_string = 1 |
145 | | // Second quote: in_string = 0 |
146 | | // Third quote: in_string = 1 |
147 | 255k | if (!in_string) { |
148 | 87.1k | in_string = 1; |
149 | 87.1k | quote_char = ch; |
150 | 87.1k | } |
151 | 168k | else if (ch == quote_char) { |
152 | 86.0k | in_string = 0; |
153 | 86.0k | } |
154 | 255k | continue; |
155 | 255k | } |
156 | | |
157 | | // Check for # outside strings |
158 | 1.55M | if (ch == '#' && !in_string) { |
159 | 188 | hash_detected = 1; |
160 | 188 | break; |
161 | 188 | } |
162 | 1.55M | } |
163 | | // If we found a # character in the expression, we need to handle comments |
164 | 7.44k | if (hash_detected) { |
165 | | // Allocate buffer for processed result |
166 | 188 | char *result = (char *)PyMem_Malloc((tok_mode->last_expr_size - tok_mode->last_expr_end + 1) * sizeof(char)); |
167 | 188 | if (!result) { |
168 | 0 | return -1; |
169 | 0 | } |
170 | | |
171 | 188 | Py_ssize_t i = 0; // Input position |
172 | 188 | Py_ssize_t j = 0; // Output position |
173 | 188 | in_string = 0; // Whether we're in a string |
174 | 188 | quote_char = 0; // Current string quote char |
175 | | |
176 | | // Process each character |
177 | 30.4k | while (i < tok_mode->last_expr_size - tok_mode->last_expr_end) { |
178 | 30.2k | char ch = tok_mode->last_expr_buffer[i]; |
179 | | |
180 | | // Handle string quotes |
181 | 30.2k | if (ch == '"' || ch == '\'') { |
182 | | // See comment above to understand this part |
183 | 4.03k | if (!in_string) { |
184 | 1.46k | in_string = 1; |
185 | 1.46k | quote_char = ch; |
186 | 2.57k | } else if (ch == quote_char) { |
187 | 1.44k | in_string = 0; |
188 | 1.44k | } |
189 | 4.03k | result[j++] = ch; |
190 | 4.03k | } |
191 | | // Skip comments |
192 | 26.2k | else if (ch == '#' && !in_string) { |
193 | 7.27k | while (i < tok_mode->last_expr_size - tok_mode->last_expr_end && |
194 | 7.27k | tok_mode->last_expr_buffer[i] != '\n') { |
195 | 7.02k | i++; |
196 | 7.02k | } |
197 | 250 | if (i < tok_mode->last_expr_size - tok_mode->last_expr_end) { |
198 | 126 | result[j++] = '\n'; |
199 | 126 | } |
200 | 250 | } |
201 | | // Copy other chars |
202 | 25.9k | else { |
203 | 25.9k | result[j++] = ch; |
204 | 25.9k | } |
205 | 30.2k | i++; |
206 | 30.2k | } |
207 | | |
208 | 188 | result[j] = '\0'; // Null-terminate the result string |
209 | 188 | res = PyUnicode_DecodeUTF8(result, j, NULL); |
210 | 188 | PyMem_Free(result); |
211 | 7.25k | } else { |
212 | 7.25k | res = PyUnicode_DecodeUTF8( |
213 | 7.25k | tok_mode->last_expr_buffer, |
214 | 7.25k | tok_mode->last_expr_size - tok_mode->last_expr_end, |
215 | 7.25k | NULL |
216 | 7.25k | ); |
217 | 7.25k | } |
218 | | |
219 | 7.44k | if (!res) { |
220 | 12 | return -1; |
221 | 12 | } |
222 | 7.43k | token->metadata = res; |
223 | 7.43k | return 0; |
224 | 7.44k | } |
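
The scan above leans on the invariant spelled out in its comment: a valid STRING token contains an odd number of quote characters, so a single in_string flag toggled on the opening quote and on each matching quote suffices to track string context. A standalone sketch of the same comment-detection pass (the helper name is hypothetical):

/* Return the index of the first '#' that is outside any string literal,
   or -1 if there is none.  Mirrors the scan in set_ftstring_expr(). */
static long find_comment_hash(const char *s, long n)
{
    int in_string = 0;
    char quote_char = 0;
    for (long i = 0; i < n; i++) {
        char ch = s[i];
        if (ch == '\\') { i++; continue; }        /* skip escaped character */
        if (ch == '"' || ch == '\'') {
            if (!in_string) { in_string = 1; quote_char = ch; }
            else if (ch == quote_char) { in_string = 0; }
            continue;
        }
        if (ch == '#' && !in_string) {
            return i;
        }
    }
    return -1;
}

/* find_comment_hash("x + '#'", 7) -> -1  (hash inside a string)
   find_comment_hash("x  # c", 6)  ->  3  (real comment)          */
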
225 | | |
226 | | int |
227 | | _PyLexer_update_ftstring_expr(struct tok_state *tok, char cur) |
228 | 54.5k | { |
229 | 54.5k | assert(tok->cur != NULL); |
230 | | |
231 | 54.5k | Py_ssize_t size = strlen(tok->cur); |
232 | 54.5k | tokenizer_mode *tok_mode = TOK_GET_MODE(tok); |
233 | | |
234 | 54.5k | switch (cur) { |
235 | 0 | case 0: |
236 | 0 | if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) { |
237 | 0 | return 1; |
238 | 0 | } |
239 | 0 | char *new_buffer = PyMem_Realloc( |
240 | 0 | tok_mode->last_expr_buffer, |
241 | 0 | tok_mode->last_expr_size + size |
242 | 0 | ); |
243 | 0 | if (new_buffer == NULL) { |
244 | 0 | PyMem_Free(tok_mode->last_expr_buffer); |
245 | 0 | goto error; |
246 | 0 | } |
247 | 0 | tok_mode->last_expr_buffer = new_buffer; |
248 | 0 | strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size); |
249 | 0 | tok_mode->last_expr_size += size; |
250 | 0 | break; |
251 | 35.9k | case '{': |
252 | 35.9k | if (tok_mode->last_expr_buffer != NULL) { |
253 | 26.5k | PyMem_Free(tok_mode->last_expr_buffer); |
254 | 26.5k | } |
255 | 35.9k | tok_mode->last_expr_buffer = PyMem_Malloc(size); |
256 | 35.9k | if (tok_mode->last_expr_buffer == NULL) { |
257 | 0 | goto error; |
258 | 0 | } |
259 | 35.9k | tok_mode->last_expr_size = size; |
260 | 35.9k | tok_mode->last_expr_end = -1; |
261 | 35.9k | strncpy(tok_mode->last_expr_buffer, tok->cur, size); |
262 | 35.9k | break; |
263 | 14.9k | case '}': |
264 | 15.8k | case '!': |
265 | 15.8k | tok_mode->last_expr_end = strlen(tok->start); |
266 | 15.8k | break; |
267 | 2.71k | case ':': |
268 | 2.71k | if (tok_mode->last_expr_end == -1) { |
269 | 2.44k | tok_mode->last_expr_end = strlen(tok->start); |
270 | 2.44k | } |
271 | 2.71k | break; |
272 | 0 | default: |
273 | 0 | Py_UNREACHABLE(); |
274 | 54.5k | } |
275 | 54.5k | return 1; |
276 | 0 | error: |
277 | 0 | tok->done = E_NOMEM; |
278 | 0 | return 0; |
279 | 54.5k | } |
280 | | |
281 | | static int |
282 | | lookahead(struct tok_state *tok, const char *test) |
283 | 7.53k | { |
284 | 7.53k | const char *s = test; |
285 | 7.53k | int res = 0; |
286 | 19.7k | while (1) { |
287 | 19.7k | int c = tok_nextc(tok); |
288 | 19.7k | if (*s == 0) { |
289 | 7.42k | res = !is_potential_identifier_char(c); |
290 | 7.42k | } |
291 | 12.2k | else if (c == *s) { |
292 | 12.1k | s++; |
293 | 12.1k | continue; |
294 | 12.1k | } |
295 | | |
296 | 7.53k | tok_backup(tok, c); |
297 | 19.7k | while (s != test) { |
298 | 12.1k | tok_backup(tok, *--s); |
299 | 12.1k | } |
300 | 7.53k | return res; |
301 | 19.7k | } |
302 | 7.53k | } |
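
lookahead() consumes characters while they match test, then pushes every consumed character back with tok_backup(), so the stream position is unchanged whatever the outcome; the final is_potential_identifier_char() check is what keeps, say, "formula" from matching "for". A self-contained sketch over a flat buffer, where a saved pointer collapses the backup loop (names hypothetical):

static int is_ident_char(int c)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
        || (c >= '0' && c <= '9') || c == '_' || c >= 128;
}

/* Match `test` at *pos and require that the match is not followed by
   another identifier character.  The position is fully restored either way. */
static int lookahead_sketch(const char **pos, const char *test)
{
    const char *save = *pos;          /* cheap restore for a flat buffer */
    const char *s = test;
    while (*s && **pos == *s) {
        (*pos)++;
        s++;
    }
    int res = (*s == 0) && !is_ident_char((unsigned char)**pos);
    *pos = save;                      /* the tok_backup() loop, collapsed */
    return res;
}

/* const char *p = "for x";   lookahead_sketch(&p, "for") -> 1
   const char *q = "formula"; lookahead_sketch(&q, "for") -> 0 */
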
303 | | |
304 | | static int |
305 | 89.7k | verify_end_of_number(struct tok_state *tok, int c, const char *kind) { |
306 | 89.7k | if (tok->tok_extra_tokens) { |
307 | | // When we are parsing extra tokens, we don't want to emit warnings |
308 | | // about invalid literals, because we want to be a bit more liberal. |
309 | 0 | return 1; |
310 | 0 | } |
311 | | /* Emit a deprecation warning only if the numeric literal is immediately |
312 | | * followed by one of the keywords that can occur after a numeric literal |
313 | | * in valid code: "and", "else", "for", "if", "in", "is" and "or". |
314 | | * This allows existing valid code to be deprecated gradually, without |
315 | | * emitting a warning before the error in most cases of an invalid numeric |
316 | | * literal (which would be confusing and break existing tests). |
317 | | * Raise a syntax error with a slightly better message than plain |
318 | | * "invalid syntax" if the numeric literal is immediately followed by |
319 | | * some other keyword or identifier. |
320 | | */ |
321 | 89.7k | int r = 0; |
322 | 89.7k | if (c == 'a') { |
323 | 885 | r = lookahead(tok, "nd"); |
324 | 885 | } |
325 | 88.8k | else if (c == 'e') { |
326 | 333 | r = lookahead(tok, "lse"); |
327 | 333 | } |
328 | 88.5k | else if (c == 'f') { |
329 | 3.02k | r = lookahead(tok, "or"); |
330 | 3.02k | } |
331 | 85.4k | else if (c == 'i') { |
332 | 2.06k | int c2 = tok_nextc(tok); |
333 | 2.06k | if (c2 == 'f' || c2 == 'n' || c2 == 's') { |
334 | 2.05k | r = 1; |
335 | 2.05k | } |
336 | 2.06k | tok_backup(tok, c2); |
337 | 2.06k | } |
338 | 83.4k | else if (c == 'o') { |
339 | 2.99k | r = lookahead(tok, "r"); |
340 | 2.99k | } |
341 | 80.4k | else if (c == 'n') { |
342 | 296 | r = lookahead(tok, "ot"); |
343 | 296 | } |
344 | 89.7k | if (r) { |
345 | 9.47k | tok_backup(tok, c); |
346 | 9.47k | if (_PyTokenizer_parser_warn(tok, PyExc_SyntaxWarning, |
347 | 9.47k | "invalid %s literal", kind)) |
348 | 0 | { |
349 | 0 | return 0; |
350 | 0 | } |
351 | 9.47k | tok_nextc(tok); |
352 | 9.47k | } |
353 | 80.2k | else /* In future releases, only error will remain. */ |
354 | 80.2k | if (c < 128 && is_potential_identifier_char(c)) { |
355 | 247 | tok_backup(tok, c); |
356 | 247 | _PyTokenizer_syntaxerror(tok, "invalid %s literal", kind); |
357 | 247 | return 0; |
358 | 247 | } |
359 | 89.4k | return 1; |
360 | 89.7k | } |
361 | | |
362 | | /* Verify that the identifier follows PEP 3131. */ |
363 | | static int |
364 | | verify_identifier(struct tok_state *tok) |
365 | 14.2k | { |
366 | 14.2k | if (tok->tok_extra_tokens) { |
367 | 0 | return 1; |
368 | 0 | } |
369 | 14.2k | PyObject *s; |
370 | 14.2k | if (tok->decoding_erred) |
371 | 0 | return 0; |
372 | 14.2k | s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); |
373 | 14.2k | if (s == NULL) { |
374 | 1.15k | if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { |
375 | 1.15k | tok->done = E_DECODE; |
376 | 1.15k | } |
377 | 0 | else { |
378 | 0 | tok->done = E_ERROR; |
379 | 0 | } |
380 | 1.15k | return 0; |
381 | 1.15k | } |
382 | 13.1k | Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); |
383 | 13.1k | assert(invalid >= 0); |
384 | 13.1k | assert(PyUnicode_GET_LENGTH(s) > 0); |
385 | 13.1k | if (invalid < PyUnicode_GET_LENGTH(s)) { |
386 | 618 | Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); |
387 | 618 | if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { |
388 | | /* Determine the offset in UTF-8 encoded input */ |
389 | 426 | Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); |
390 | 426 | if (s != NULL) { |
391 | 426 | Py_SETREF(s, PyUnicode_AsUTF8String(s)); |
392 | 426 | } |
393 | 426 | if (s == NULL) { |
394 | 0 | tok->done = E_ERROR; |
395 | 0 | return 0; |
396 | 0 | } |
397 | 426 | tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); |
398 | 426 | } |
399 | 618 | Py_DECREF(s); |
400 | 618 | if (Py_UNICODE_ISPRINTABLE(ch)) { |
401 | 349 | _PyTokenizer_syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch); |
402 | 349 | } |
403 | 269 | else { |
404 | 269 | _PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", ch); |
405 | 269 | } |
406 | 618 | return 0; |
407 | 618 | } |
408 | 12.5k | Py_DECREF(s); |
409 | 12.5k | return 1; |
410 | 13.1k | } |
411 | | |
412 | | static int |
413 | | tok_decimal_tail(struct tok_state *tok) |
414 | 71.8k | { |
415 | 71.8k | int c; |
416 | | |
417 | 72.3k | while (1) { |
418 | 223k | do { |
419 | 223k | c = tok_nextc(tok); |
420 | 223k | } while (Py_ISDIGIT(c)); |
421 | 72.3k | if (c != '_') { |
422 | 71.8k | break; |
423 | 71.8k | } |
424 | 506 | c = tok_nextc(tok); |
425 | 506 | if (!Py_ISDIGIT(c)) { |
426 | 13 | tok_backup(tok, c); |
427 | 13 | _PyTokenizer_syntaxerror(tok, "invalid decimal literal"); |
428 | 13 | return 0; |
429 | 13 | } |
430 | 506 | } |
431 | 71.8k | return c; |
432 | 71.8k | } |
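
tok_decimal_tail() accepts PEP 515 digit runs: groups of decimal digits separated by single underscores, where each underscore must be followed by another digit. A standalone sketch of the same acceptance rule (hypothetical name; the real function reads from the tok_state stream rather than a string):

#include <ctype.h>

/* Return the number of bytes of `s` forming a valid decimal tail, or -1
   for a trailing or doubled underscore, which the lexer rejects with
   "invalid decimal literal" (e.g. "1_" or "1__0"). */
static int decimal_tail_len(const char *s)
{
    int i = 0;
    for (;;) {
        while (isdigit((unsigned char)s[i])) {
            i++;
        }
        if (s[i] != '_') {
            return i;
        }
        i++;                                  /* consume the '_' */
        if (!isdigit((unsigned char)s[i])) {
            return -1;                        /* '_' not followed by a digit */
        }
    }
}

/* decimal_tail_len("1_000_234+") -> 9,  decimal_tail_len("1__0") -> -1 */
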
433 | | |
434 | | static inline int |
435 | 1.11k | tok_continuation_line(struct tok_state *tok) { |
436 | 1.11k | int c = tok_nextc(tok); |
437 | 1.11k | if (c == '\r') { |
438 | 69 | c = tok_nextc(tok); |
439 | 69 | } |
440 | 1.11k | if (c != '\n') { |
441 | 89 | tok->done = E_LINECONT; |
442 | 89 | return -1; |
443 | 89 | } |
444 | 1.02k | c = tok_nextc(tok); |
445 | 1.02k | if (c == EOF) { |
446 | 40 | tok->done = E_EOF; |
447 | 40 | tok->cur = tok->inp; |
448 | 40 | return -1; |
449 | 987 | } else { |
450 | 987 | tok_backup(tok, c); |
451 | 987 | } |
452 | 987 | return c; |
453 | 1.02k | } |
454 | | |
455 | | static int |
456 | | maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok, |
457 | | int saw_b, int saw_r, int saw_u, |
458 | 19.7k | int saw_f, int saw_t) { |
459 | | // Supported: rb, rf, rt (in any order) |
460 | | // Unsupported: ub, ur, uf, ut, bf, bt, ft (in any order) |
461 | | |
462 | 19.7k | #define RETURN_SYNTAX_ERROR(PREFIX1, PREFIX2) \ |
463 | 19.7k | do { \ |
464 | 8 | (void)_PyTokenizer_syntaxerror_known_range( \ |
465 | 8 | tok, (int)(tok->start + 1 - tok->line_start), \ |
466 | 8 | (int)(tok->cur - tok->line_start), \ |
467 | 8 | "'" PREFIX1 "' and '" PREFIX2 "' prefixes are incompatible"); \ |
468 | 8 | return -1; \ |
469 | 8 | } while (0) |
470 | | |
471 | 19.7k | if (saw_u && saw_b) { |
472 | 1 | RETURN_SYNTAX_ERROR("u", "b"); |
473 | 1 | } |
474 | 19.7k | if (saw_u && saw_r) { |
475 | 1 | RETURN_SYNTAX_ERROR("u", "r"); |
476 | 1 | } |
477 | 19.7k | if (saw_u && saw_f) { |
478 | 1 | RETURN_SYNTAX_ERROR("u", "f"); |
479 | 1 | } |
480 | 19.7k | if (saw_u && saw_t) { |
481 | 1 | RETURN_SYNTAX_ERROR("u", "t"); |
482 | 1 | } |
483 | | |
484 | 19.7k | if (saw_b && saw_f) { |
485 | 2 | RETURN_SYNTAX_ERROR("b", "f"); |
486 | 2 | } |
487 | 19.7k | if (saw_b && saw_t) { |
488 | 1 | RETURN_SYNTAX_ERROR("b", "t"); |
489 | 1 | } |
490 | | |
491 | 19.7k | if (saw_f && saw_t) { |
492 | 1 | RETURN_SYNTAX_ERROR("f", "t"); |
493 | 1 | } |
494 | | |
495 | 19.7k | #undef RETURN_SYNTAX_ERROR |
496 | | |
497 | 19.7k | return 0; |
498 | 19.7k | } |
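
The pairwise checks above encode a small compatibility matrix: r may combine with b, f, or t; u combines with nothing; and b, f, t are mutually exclusive. The same rule stated compactly, as a hypothetical predicate:

/* Same acceptance rule as the checks above: u stands alone, and at most
   one of b/f/t may appear; r is compatible with any single one of them. */
static int prefixes_compatible(int saw_b, int saw_r, int saw_u,
                               int saw_f, int saw_t)
{
    if (saw_u && (saw_b || saw_r || saw_f || saw_t)) {
        return 0;
    }
    if (saw_b + saw_f + saw_t > 1) {
        return 0;
    }
    return 1;
}

/* prefixes_compatible(0,1,0,1,0) -> 1   ("rf" is fine)
   prefixes_compatible(1,0,0,1,0) -> 0   ("bf" is rejected) */
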
499 | | |
500 | | static int |
501 | | tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) |
502 | 1.64M | { |
503 | 1.64M | int c; |
504 | 1.64M | int blankline, nonascii; |
505 | | |
506 | 1.64M | const char *p_start = NULL; |
507 | 1.64M | const char *p_end = NULL; |
508 | 1.72M | nextline: |
509 | 1.72M | tok->start = NULL; |
510 | 1.72M | tok->starting_col_offset = -1; |
511 | 1.72M | blankline = 0; |
512 | | |
513 | | |
514 | | /* Get indentation level */ |
515 | 1.72M | if (tok->atbol) { |
516 | 211k | int col = 0; |
517 | 211k | int altcol = 0; |
518 | 211k | tok->atbol = 0; |
519 | 211k | int cont_line_col = 0; |
520 | 819k | for (;;) { |
521 | 819k | c = tok_nextc(tok); |
522 | 819k | if (c == ' ') { |
523 | 605k | col++, altcol++; |
524 | 605k | } |
525 | 214k | else if (c == '\t') { |
526 | 851 | col = (col / tok->tabsize + 1) * tok->tabsize; |
527 | 851 | altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE; |
528 | 851 | } |
529 | 213k | else if (c == '\014') {/* Control-L (formfeed) */ |
530 | 1.61k | col = altcol = 0; /* For Emacs users */ |
531 | 1.61k | } |
532 | 211k | else if (c == '\\') { |
533 | | // Indentation cannot be split over multiple physical lines |
534 | | // using backslashes. This means that if we found a backslash |
535 | | // preceded by whitespace, **the first one we find** determines |
536 | | // the level of indentation of whatever comes next. |
537 | 653 | cont_line_col = cont_line_col ? cont_line_col : col; |
538 | 653 | if ((c = tok_continuation_line(tok)) == -1) { |
539 | 33 | return MAKE_TOKEN(ERRORTOKEN); |
540 | 33 | } |
541 | 653 | } |
542 | 211k | else { |
543 | 211k | break; |
544 | 211k | } |
545 | 819k | } |
546 | 211k | tok_backup(tok, c); |
547 | 211k | if (c == '#' || c == '\n' || c == '\r') { |
548 | | /* Lines with only whitespace and/or comments |
549 | | shouldn't affect the indentation and are |
550 | | not passed to the parser as NEWLINE tokens, |
551 | | except *totally* empty lines in interactive |
552 | | mode, which signal the end of a command group. */ |
553 | 41.6k | if (col == 0 && c == '\n' && tok->prompt != NULL) { |
554 | 0 | blankline = 0; /* Let it through */ |
555 | 0 | } |
556 | 41.6k | else if (tok->prompt != NULL && tok->lineno == 1) { |
557 | | /* In interactive mode, if the first line contains |
558 | | only spaces and/or a comment, let it through. */ |
559 | 0 | blankline = 0; |
560 | 0 | col = altcol = 0; |
561 | 0 | } |
562 | 41.6k | else { |
563 | 41.6k | blankline = 1; /* Ignore completely */ |
564 | 41.6k | } |
565 | | /* We can't jump back right here since we still |
566 | | may need to skip to the end of a comment */ |
567 | 41.6k | } |
568 | 211k | if (!blankline && tok->level == 0) { |
569 | 130k | col = cont_line_col ? cont_line_col : col; |
570 | 130k | altcol = cont_line_col ? cont_line_col : altcol; |
571 | 130k | if (col == tok->indstack[tok->indent]) { |
572 | | /* No change */ |
573 | 96.4k | if (altcol != tok->altindstack[tok->indent]) { |
574 | 1 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
575 | 1 | } |
576 | 96.4k | } |
577 | 33.7k | else if (col > tok->indstack[tok->indent]) { |
578 | | /* Indent -- always one */ |
579 | 18.9k | if (tok->indent+1 >= MAXINDENT) { |
580 | 0 | tok->done = E_TOODEEP; |
581 | 0 | tok->cur = tok->inp; |
582 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
583 | 0 | } |
584 | 18.9k | if (altcol <= tok->altindstack[tok->indent]) { |
585 | 3 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
586 | 3 | } |
587 | 18.9k | tok->pendin++; |
588 | 18.9k | tok->indstack[++tok->indent] = col; |
589 | 18.9k | tok->altindstack[tok->indent] = altcol; |
590 | 18.9k | } |
591 | 14.8k | else /* col < tok->indstack[tok->indent] */ { |
592 | | /* Dedent -- any number, must be consistent */ |
593 | 33.0k | while (tok->indent > 0 && |
594 | 33.0k | col < tok->indstack[tok->indent]) { |
595 | 18.2k | tok->pendin--; |
596 | 18.2k | tok->indent--; |
597 | 18.2k | } |
598 | 14.8k | if (col != tok->indstack[tok->indent]) { |
599 | 11 | tok->done = E_DEDENT; |
600 | 11 | tok->cur = tok->inp; |
601 | 11 | return MAKE_TOKEN(ERRORTOKEN); |
602 | 11 | } |
603 | 14.8k | if (altcol != tok->altindstack[tok->indent]) { |
604 | 1 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
605 | 1 | } |
606 | 14.8k | } |
607 | 130k | } |
608 | 211k | } |
609 | | |
610 | 1.72M | tok->start = tok->cur; |
611 | 1.72M | tok->starting_col_offset = tok->col_offset; |
612 | | |
613 | | /* Return pending indents/dedents */ |
614 | 1.72M | if (tok->pendin != 0) { |
615 | 37.1k | if (tok->pendin < 0) { |
616 | 18.2k | if (tok->tok_extra_tokens) { |
617 | 0 | p_start = tok->cur; |
618 | 0 | p_end = tok->cur; |
619 | 0 | } |
620 | 18.2k | tok->pendin++; |
621 | 18.2k | return MAKE_TOKEN(DEDENT); |
622 | 18.2k | } |
623 | 18.9k | else { |
624 | 18.9k | if (tok->tok_extra_tokens) { |
625 | 0 | p_start = tok->buf; |
626 | 0 | p_end = tok->cur; |
627 | 0 | } |
628 | 18.9k | tok->pendin--; |
629 | 18.9k | return MAKE_TOKEN(INDENT); |
630 | 18.9k | } |
631 | 37.1k | } |
632 | | |
633 | | /* Peek ahead at the next character */ |
634 | 1.69M | c = tok_nextc(tok); |
635 | 1.69M | tok_backup(tok, c); |
636 | | |
637 | 1.69M | again: |
638 | 1.69M | tok->start = NULL; |
639 | | /* Skip spaces */ |
640 | 2.02M | do { |
641 | 2.02M | c = tok_nextc(tok); |
642 | 2.02M | } while (c == ' ' || c == '\t' || c == '\014'); |
643 | | |
644 | | /* Set start of current token */ |
645 | 1.69M | tok->start = tok->cur == NULL ? NULL : tok->cur - 1; |
646 | 1.69M | tok->starting_col_offset = tok->col_offset - 1; |
647 | | |
648 | | /* Skip comment, unless it's a type comment */ |
649 | 1.69M | if (c == '#') { |
650 | | |
651 | 41.6k | const char* p = NULL; |
652 | 41.6k | const char *prefix, *type_start; |
653 | 41.6k | int current_starting_col_offset; |
654 | | |
655 | 1.27M | while (c != EOF && c != '\n' && c != '\r') { |
656 | 1.23M | c = tok_nextc(tok); |
657 | 1.23M | } |
658 | | |
659 | 41.6k | if (tok->tok_extra_tokens) { |
660 | 0 | p = tok->start; |
661 | 0 | } |
662 | | |
663 | 41.6k | if (tok->type_comments) { |
664 | 0 | p = tok->start; |
665 | 0 | current_starting_col_offset = tok->starting_col_offset; |
666 | 0 | prefix = type_comment_prefix; |
667 | 0 | while (*prefix && p < tok->cur) { |
668 | 0 | if (*prefix == ' ') { |
669 | 0 | while (*p == ' ' || *p == '\t') { |
670 | 0 | p++; |
671 | 0 | current_starting_col_offset++; |
672 | 0 | } |
673 | 0 | } else if (*prefix == *p) { |
674 | 0 | p++; |
675 | 0 | current_starting_col_offset++; |
676 | 0 | } else { |
677 | 0 | break; |
678 | 0 | } |
679 | | |
680 | 0 | prefix++; |
681 | 0 | } |
682 | | |
683 | | /* This is a type comment if we matched all of type_comment_prefix. */ |
684 | 0 | if (!*prefix) { |
685 | 0 | int is_type_ignore = 1; |
686 | | // +6 in order to skip the word 'ignore' |
687 | 0 | const char *ignore_end = p + 6; |
688 | 0 | const int ignore_end_col_offset = current_starting_col_offset + 6; |
689 | 0 | tok_backup(tok, c); /* don't eat the newline or EOF */ |
690 | |
|
691 | 0 | type_start = p; |
692 | | |
693 | | /* A TYPE_IGNORE is "type: ignore" followed by the end of the token |
694 | | * or anything ASCII and non-alphanumeric. */ |
695 | 0 | is_type_ignore = ( |
696 | 0 | tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 |
697 | 0 | && !(tok->cur > ignore_end |
698 | 0 | && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); |
699 | |
700 | 0 | if (is_type_ignore) { |
701 | 0 | p_start = ignore_end; |
702 | 0 | p_end = tok->cur; |
703 | | |
704 | | /* If this type ignore is the only thing on the line, consume the newline also. */ |
705 | 0 | if (blankline) { |
706 | 0 | tok_nextc(tok); |
707 | 0 | tok->atbol = 1; |
708 | 0 | } |
709 | 0 | return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); |
710 | 0 | } else { |
711 | 0 | p_start = type_start; |
712 | 0 | p_end = tok->cur; |
713 | 0 | return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset); |
714 | 0 | } |
715 | 0 | } |
716 | 0 | } |
717 | 41.6k | if (tok->tok_extra_tokens) { |
718 | 0 | tok_backup(tok, c); /* don't eat the newline or EOF */ |
719 | 0 | p_start = p; |
720 | 0 | p_end = tok->cur; |
721 | 0 | tok->comment_newline = blankline; |
722 | 0 | return MAKE_TOKEN(COMMENT); |
723 | 0 | } |
724 | 41.6k | } |
725 | | |
726 | 1.69M | if (tok->done == E_INTERACT_STOP) { |
727 | 0 | return MAKE_TOKEN(ENDMARKER); |
728 | 0 | } |
729 | | |
730 | | /* Check for EOF and errors now */ |
731 | 1.69M | if (c == EOF) { |
732 | 15.0k | if (tok->level) { |
733 | 3.59k | return MAKE_TOKEN(ERRORTOKEN); |
734 | 3.59k | } |
735 | 11.4k | return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN); |
736 | 15.0k | } |
737 | | |
738 | | /* Identifier (most frequent token!) */ |
739 | 1.67M | nonascii = 0; |
740 | 1.67M | if (is_potential_identifier_start(c)) { |
741 | | /* Process the various legal combinations of b"", r"", u"", and f"". */ |
742 | 492k | int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0; |
743 | 605k | while (1) { |
744 | 605k | if (!saw_b && (c == 'b' || c == 'B')) { |
745 | 20.0k | saw_b = 1; |
746 | 20.0k | } |
747 | | /* Since u"" is only a backwards-compatibility literal, we don't |
748 | | support it in arbitrary order the way we do byte literals. */ |
749 | 585k | else if (!saw_u && (c == 'u'|| c == 'U')) { |
750 | 6.25k | saw_u = 1; |
751 | 6.25k | } |
752 | | /* ur"" and ru"" are not supported */ |
753 | 579k | else if (!saw_r && (c == 'r' || c == 'R')) { |
754 | 35.5k | saw_r = 1; |
755 | 35.5k | } |
756 | 543k | else if (!saw_f && (c == 'f' || c == 'F')) { |
757 | 39.7k | saw_f = 1; |
758 | 39.7k | } |
759 | 504k | else if (!saw_t && (c == 't' || c == 'T')) { |
760 | 31.1k | saw_t = 1; |
761 | 31.1k | } |
762 | 472k | else { |
763 | 472k | break; |
764 | 472k | } |
765 | 132k | c = tok_nextc(tok); |
766 | 132k | if (c == '"' || c == '\'') { |
767 | | // Raise error on incompatible string prefixes: |
768 | 19.7k | int status = maybe_raise_syntax_error_for_string_prefixes( |
769 | 19.7k | tok, saw_b, saw_r, saw_u, saw_f, saw_t); |
770 | 19.7k | if (status < 0) { |
771 | 8 | return MAKE_TOKEN(ERRORTOKEN); |
772 | 8 | } |
773 | | |
774 | | // Handle valid f or t string creation: |
775 | 19.7k | if (saw_f || saw_t) { |
776 | 14.8k | goto f_string_quote; |
777 | 14.8k | } |
778 | 4.93k | goto letter_quote; |
779 | 19.7k | } |
780 | 132k | } |
781 | 2.14M | while (is_potential_identifier_char(c)) { |
782 | 1.67M | if (c >= 128) { |
783 | 192k | nonascii = 1; |
784 | 192k | } |
785 | 1.67M | c = tok_nextc(tok); |
786 | 1.67M | } |
787 | 472k | tok_backup(tok, c); |
788 | 472k | if (nonascii && !verify_identifier(tok)) { |
789 | 1.77k | return MAKE_TOKEN(ERRORTOKEN); |
790 | 1.77k | } |
791 | | |
792 | 471k | p_start = tok->start; |
793 | 471k | p_end = tok->cur; |
794 | | |
795 | 471k | return MAKE_TOKEN(NAME); |
796 | 472k | } |
797 | | |
798 | 1.18M | if (c == '\r') { |
799 | 408 | c = tok_nextc(tok); |
800 | 408 | } |
801 | | |
802 | | /* Newline */ |
803 | 1.18M | if (c == '\n') { |
804 | 192k | tok->atbol = 1; |
805 | 192k | if (blankline || tok->level > 0) { |
806 | 80.9k | if (tok->tok_extra_tokens) { |
807 | 0 | if (tok->comment_newline) { |
808 | 0 | tok->comment_newline = 0; |
809 | 0 | } |
810 | 0 | p_start = tok->start; |
811 | 0 | p_end = tok->cur; |
812 | 0 | return MAKE_TOKEN(NL); |
813 | 0 | } |
814 | 80.9k | goto nextline; |
815 | 80.9k | } |
816 | 111k | if (tok->comment_newline && tok->tok_extra_tokens) { |
817 | 0 | tok->comment_newline = 0; |
818 | 0 | p_start = tok->start; |
819 | 0 | p_end = tok->cur; |
820 | 0 | return MAKE_TOKEN(NL); |
821 | 0 | } |
822 | 111k | p_start = tok->start; |
823 | 111k | p_end = tok->cur - 1; /* Leave '\n' out of the string */ |
824 | 111k | tok->cont_line = 0; |
825 | 111k | return MAKE_TOKEN(NEWLINE); |
826 | 111k | } |
827 | | |
828 | | /* Period or number starting with period? */ |
829 | 992k | if (c == '.') { |
830 | 32.2k | c = tok_nextc(tok); |
831 | 32.2k | if (Py_ISDIGIT(c)) { |
832 | 3.11k | goto fraction; |
833 | 29.1k | } else if (c == '.') { |
834 | 3.54k | c = tok_nextc(tok); |
835 | 3.54k | if (c == '.') { |
836 | 2.90k | p_start = tok->start; |
837 | 2.90k | p_end = tok->cur; |
838 | 2.90k | return MAKE_TOKEN(ELLIPSIS); |
839 | 2.90k | } |
840 | 644 | else { |
841 | 644 | tok_backup(tok, c); |
842 | 644 | } |
843 | 644 | tok_backup(tok, '.'); |
844 | 644 | } |
845 | 25.5k | else { |
846 | 25.5k | tok_backup(tok, c); |
847 | 25.5k | } |
848 | 26.2k | p_start = tok->start; |
849 | 26.2k | p_end = tok->cur; |
850 | 26.2k | return MAKE_TOKEN(DOT); |
851 | 32.2k | } |
852 | | |
853 | | /* Number */ |
854 | 959k | if (Py_ISDIGIT(c)) { |
855 | 86.7k | if (c == '0') { |
856 | | /* Hex, octal or binary -- maybe. */ |
857 | 30.5k | c = tok_nextc(tok); |
858 | 30.5k | if (c == 'x' || c == 'X') { |
859 | | /* Hex */ |
860 | 15.7k | c = tok_nextc(tok); |
861 | 15.9k | do { |
862 | 15.9k | if (c == '_') { |
863 | 260 | c = tok_nextc(tok); |
864 | 260 | } |
865 | 15.9k | if (!Py_ISXDIGIT(c)) { |
866 | 20 | tok_backup(tok, c); |
867 | 20 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid hexadecimal literal")); |
868 | 20 | } |
869 | 69.0k | do { |
870 | 69.0k | c = tok_nextc(tok); |
871 | 69.0k | } while (Py_ISXDIGIT(c)); |
872 | 15.9k | } while (c == '_'); |
873 | 15.7k | if (!verify_end_of_number(tok, c, "hexadecimal")) { |
874 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
875 | 1 | } |
876 | 15.7k | } |
877 | 14.7k | else if (c == 'o' || c == 'O') { |
878 | | /* Octal */ |
879 | 622 | c = tok_nextc(tok); |
880 | 1.23k | do { |
881 | 1.23k | if (c == '_') { |
882 | 616 | c = tok_nextc(tok); |
883 | 616 | } |
884 | 1.23k | if (c < '0' || c >= '8') { |
885 | 22 | if (Py_ISDIGIT(c)) { |
886 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
887 | 1 | "invalid digit '%c' in octal literal", c)); |
888 | 1 | } |
889 | 21 | else { |
890 | 21 | tok_backup(tok, c); |
891 | 21 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid octal literal")); |
892 | 21 | } |
893 | 22 | } |
894 | 3.39k | do { |
895 | 3.39k | c = tok_nextc(tok); |
896 | 3.39k | } while ('0' <= c && c < '8'); |
897 | 1.21k | } while (c == '_'); |
898 | 600 | if (Py_ISDIGIT(c)) { |
899 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
900 | 1 | "invalid digit '%c' in octal literal", c)); |
901 | 1 | } |
902 | 599 | if (!verify_end_of_number(tok, c, "octal")) { |
903 | 4 | return MAKE_TOKEN(ERRORTOKEN); |
904 | 4 | } |
905 | 599 | } |
906 | 14.1k | else if (c == 'b' || c == 'B') { |
907 | | /* Binary */ |
908 | 552 | c = tok_nextc(tok); |
909 | 645 | do { |
910 | 645 | if (c == '_') { |
911 | 99 | c = tok_nextc(tok); |
912 | 99 | } |
913 | 645 | if (c != '0' && c != '1') { |
914 | 21 | if (Py_ISDIGIT(c)) { |
915 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c)); |
916 | 1 | } |
917 | 20 | else { |
918 | 20 | tok_backup(tok, c); |
919 | 20 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid binary literal")); |
920 | 20 | } |
921 | 21 | } |
922 | 3.94k | do { |
923 | 3.94k | c = tok_nextc(tok); |
924 | 3.94k | } while (c == '0' || c == '1'); |
925 | 624 | } while (c == '_'); |
926 | 531 | if (Py_ISDIGIT(c)) { |
927 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c)); |
928 | 1 | } |
929 | 530 | if (!verify_end_of_number(tok, c, "binary")) { |
930 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
931 | 1 | } |
932 | 530 | } |
933 | 13.6k | else { |
934 | 13.6k | int nonzero = 0; |
935 | | /* maybe old-style octal; c is first char of it */ |
936 | | /* in any case, allow '0' as a literal */ |
937 | 14.9k | while (1) { |
938 | 14.9k | if (c == '_') { |
939 | 102 | c = tok_nextc(tok); |
940 | 102 | if (!Py_ISDIGIT(c)) { |
941 | 6 | tok_backup(tok, c); |
942 | 6 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal")); |
943 | 6 | } |
944 | 102 | } |
945 | 14.9k | if (c != '0') { |
946 | 13.6k | break; |
947 | 13.6k | } |
948 | 1.29k | c = tok_nextc(tok); |
949 | 1.29k | } |
950 | 13.6k | char* zeros_end = tok->cur; |
951 | 13.6k | if (Py_ISDIGIT(c)) { |
952 | 547 | nonzero = 1; |
953 | 547 | c = tok_decimal_tail(tok); |
954 | 547 | if (c == 0) { |
955 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
956 | 1 | } |
957 | 547 | } |
958 | 13.6k | if (c == '.') { |
959 | 715 | c = tok_nextc(tok); |
960 | 715 | goto fraction; |
961 | 715 | } |
962 | 12.8k | else if (c == 'e' || c == 'E') { |
963 | 690 | goto exponent; |
964 | 690 | } |
965 | 12.2k | else if (c == 'j' || c == 'J') { |
966 | 1.01k | goto imaginary; |
967 | 1.01k | } |
968 | 11.1k | else if (nonzero && !tok->tok_extra_tokens) { |
969 | | /* Old-style octal: now disallowed. */ |
970 | 26 | tok_backup(tok, c); |
971 | 26 | return MAKE_TOKEN(_PyTokenizer_syntaxerror_known_range( |
972 | 26 | tok, (int)(tok->start + 1 - tok->line_start), |
973 | 26 | (int)(zeros_end - tok->line_start), |
974 | 26 | "leading zeros in decimal integer " |
975 | 26 | "literals are not permitted; " |
976 | 26 | "use an 0o prefix for octal integers")); |
977 | 26 | } |
978 | 11.1k | if (!verify_end_of_number(tok, c, "decimal")) { |
979 | 35 | return MAKE_TOKEN(ERRORTOKEN); |
980 | 35 | } |
981 | 11.1k | } |
982 | 30.5k | } |
983 | 56.2k | else { |
984 | | /* Decimal */ |
985 | 56.2k | c = tok_decimal_tail(tok); |
986 | 56.2k | if (c == 0) { |
987 | 10 | return MAKE_TOKEN(ERRORTOKEN); |
988 | 10 | } |
989 | 56.2k | { |
990 | | /* Accept floating-point numbers. */ |
991 | 56.2k | if (c == '.') { |
992 | 3.76k | c = tok_nextc(tok); |
993 | 7.59k | fraction: |
994 | | /* Fraction */ |
995 | 7.59k | if (Py_ISDIGIT(c)) { |
996 | 5.87k | c = tok_decimal_tail(tok); |
997 | 5.87k | if (c == 0) { |
998 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
999 | 1 | } |
1000 | 5.87k | } |
1001 | 7.59k | } |
1002 | 60.0k | if (c == 'e' || c == 'E') { |
1003 | 8.89k | int e; |
1004 | 9.58k | exponent: |
1005 | 9.58k | e = c; |
1006 | | /* Exponent part */ |
1007 | 9.58k | c = tok_nextc(tok); |
1008 | 9.58k | if (c == '+' || c == '-') { |
1009 | 3.28k | c = tok_nextc(tok); |
1010 | 3.28k | if (!Py_ISDIGIT(c)) { |
1011 | 14 | tok_backup(tok, c); |
1012 | 14 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal")); |
1013 | 14 | } |
1014 | 6.30k | } else if (!Py_ISDIGIT(c)) { |
1015 | 336 | tok_backup(tok, c); |
1016 | 336 | if (!verify_end_of_number(tok, e, "decimal")) { |
1017 | 48 | return MAKE_TOKEN(ERRORTOKEN); |
1018 | 48 | } |
1019 | 288 | tok_backup(tok, e); |
1020 | 288 | p_start = tok->start; |
1021 | 288 | p_end = tok->cur; |
1022 | 288 | return MAKE_TOKEN(NUMBER); |
1023 | 336 | } |
1024 | 9.23k | c = tok_decimal_tail(tok); |
1025 | 9.23k | if (c == 0) { |
1026 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
1027 | 1 | } |
1028 | 9.23k | } |
1029 | 60.3k | if (c == 'j' || c == 'J') { |
1030 | | /* Imaginary part */ |
1031 | 4.67k | imaginary: |
1032 | 4.67k | c = tok_nextc(tok); |
1033 | 4.67k | if (!verify_end_of_number(tok, c, "imaginary")) { |
1034 | 12 | return MAKE_TOKEN(ERRORTOKEN); |
1035 | 12 | } |
1036 | 4.67k | } |
1037 | 56.7k | else if (!verify_end_of_number(tok, c, "decimal")) { |
1038 | 146 | return MAKE_TOKEN(ERRORTOKEN); |
1039 | 146 | } |
1040 | 60.3k | } |
1041 | 60.3k | } |
1042 | 89.1k | tok_backup(tok, c); |
1043 | 89.1k | p_start = tok->start; |
1044 | 89.1k | p_end = tok->cur; |
1045 | 89.1k | return MAKE_TOKEN(NUMBER); |
1046 | 86.7k | } |
1047 | | |
1048 | 888k | f_string_quote: |
1049 | 888k | if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't') |
1050 | 888k | && (c == '\'' || c == '"'))) { |
1051 | | |
1052 | 14.8k | int quote = c; |
1053 | 14.8k | int quote_size = 1; /* 1 or 3 */ |
1054 | | |
1055 | | /* Nodes of type STRING, especially multi line strings |
1056 | | must be handled differently in order to get both |
1057 | | the starting line number and the column offset right. |
1058 | | (cf. issue 16806) */ |
1059 | 14.8k | tok->first_lineno = tok->lineno; |
1060 | 14.8k | tok->multi_line_start = tok->line_start; |
1061 | | |
1062 | | /* Find the quote size and start of string */ |
1063 | 14.8k | int after_quote = tok_nextc(tok); |
1064 | 14.8k | if (after_quote == quote) { |
1065 | 2.28k | int after_after_quote = tok_nextc(tok); |
1066 | 2.28k | if (after_after_quote == quote) { |
1067 | 775 | quote_size = 3; |
1068 | 775 | } |
1069 | 1.50k | else { |
1070 | | // TODO: Check this |
1071 | 1.50k | tok_backup(tok, after_after_quote); |
1072 | 1.50k | tok_backup(tok, after_quote); |
1073 | 1.50k | } |
1074 | 2.28k | } |
1075 | 14.8k | if (after_quote != quote) { |
1076 | 12.5k | tok_backup(tok, after_quote); |
1077 | 12.5k | } |
1078 | | |
1079 | | |
1080 | 14.8k | p_start = tok->start; |
1081 | 14.8k | p_end = tok->cur; |
1082 | 14.8k | if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) { |
1083 | 2 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested f-strings or t-strings")); |
1084 | 2 | } |
1085 | 14.7k | tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok); |
1086 | 14.7k | the_current_tok->kind = TOK_FSTRING_MODE; |
1087 | 14.7k | the_current_tok->quote = quote; |
1088 | 14.7k | the_current_tok->quote_size = quote_size; |
1089 | 14.7k | the_current_tok->start = tok->start; |
1090 | 14.7k | the_current_tok->multi_line_start = tok->line_start; |
1091 | 14.7k | the_current_tok->first_line = tok->lineno; |
1092 | 14.7k | the_current_tok->start_offset = -1; |
1093 | 14.7k | the_current_tok->multi_line_start_offset = -1; |
1094 | 14.7k | the_current_tok->last_expr_buffer = NULL; |
1095 | 14.7k | the_current_tok->last_expr_size = 0; |
1096 | 14.7k | the_current_tok->last_expr_end = -1; |
1097 | 14.7k | the_current_tok->in_format_spec = 0; |
1098 | 14.7k | the_current_tok->in_debug = 0; |
1099 | | |
1100 | 14.7k | enum string_kind_t string_kind = FSTRING; |
1101 | 14.7k | switch (*tok->start) { |
1102 | 907 | case 'T': |
1103 | 3.09k | case 't': |
1104 | 3.09k | the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r'; |
1105 | 3.09k | string_kind = TSTRING; |
1106 | 3.09k | break; |
1107 | 1.74k | case 'F': |
1108 | 11.3k | case 'f': |
1109 | 11.3k | the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r'; |
1110 | 11.3k | break; |
1111 | 111 | case 'R': |
1112 | 384 | case 'r': |
1113 | 384 | the_current_tok->raw = 1; |
1114 | 384 | if (Py_TOLOWER(*(tok->start + 1)) == 't') { |
1115 | 205 | string_kind = TSTRING; |
1116 | 205 | } |
1117 | 384 | break; |
1118 | 0 | default: |
1119 | 0 | Py_UNREACHABLE(); |
1120 | 14.7k | } |
1121 | | |
1122 | 14.7k | the_current_tok->string_kind = string_kind; |
1123 | 14.7k | the_current_tok->curly_bracket_depth = 0; |
1124 | 14.7k | the_current_tok->curly_bracket_expr_start_depth = -1; |
1125 | 14.7k | return string_kind == TSTRING ? MAKE_TOKEN(TSTRING_START) : MAKE_TOKEN(FSTRING_START); |
1126 | 14.7k | } |
1127 | | |
1128 | 878k | letter_quote: |
1129 | | /* String */ |
1130 | 878k | if (c == '\'' || c == '"') { |
1131 | 56.7k | int quote = c; |
1132 | 56.7k | int quote_size = 1; /* 1 or 3 */ |
1133 | 56.7k | int end_quote_size = 0; |
1134 | 56.7k | int has_escaped_quote = 0; |
1135 | | |
1136 | | /* Nodes of type STRING, especially multi line strings |
1137 | | must be handled differently in order to get both |
1138 | | the starting line number and the column offset right. |
1139 | | (cf. issue 16806) */ |
1140 | 56.7k | tok->first_lineno = tok->lineno; |
1141 | 56.7k | tok->multi_line_start = tok->line_start; |
1142 | | |
1143 | | /* Find the quote size and start of string */ |
1144 | 56.7k | c = tok_nextc(tok); |
1145 | 56.7k | if (c == quote) { |
1146 | 9.56k | c = tok_nextc(tok); |
1147 | 9.56k | if (c == quote) { |
1148 | 2.22k | quote_size = 3; |
1149 | 2.22k | } |
1150 | 7.34k | else { |
1151 | 7.34k | end_quote_size = 1; /* empty string found */ |
1152 | 7.34k | } |
1153 | 9.56k | } |
1154 | 56.7k | if (c != quote) { |
1155 | 54.5k | tok_backup(tok, c); |
1156 | 54.5k | } |
1157 | | |
1158 | | /* Get rest of string */ |
1159 | 1.13M | while (end_quote_size != quote_size) { |
1160 | 1.07M | c = tok_nextc(tok); |
1161 | 1.07M | if (tok->done == E_ERROR) { |
1162 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1163 | 0 | } |
1164 | 1.07M | if (tok->done == E_DECODE) { |
1165 | 0 | break; |
1166 | 0 | } |
1167 | 1.07M | if (c == EOF || (quote_size == 1 && c == '\n')) { |
1168 | 442 | assert(tok->multi_line_start != NULL); |
1169 | | // shift the tok_state's location into |
1170 | | // the start of string, and report the error |
1171 | | // from the initial quote character |
1172 | 442 | tok->cur = (char *)tok->start; |
1173 | 442 | tok->cur++; |
1174 | 442 | tok->line_start = tok->multi_line_start; |
1175 | 442 | int start = tok->lineno; |
1176 | 442 | tok->lineno = tok->first_lineno; |
1177 | | |
1178 | 442 | if (INSIDE_FSTRING(tok)) { |
1179 | | /* When we are in an f-string, before raising the |
1180 | | * unterminated string literal error, check whether |
1181 | | * the initial quote matches the f-string's quotes; |
1182 | | * if it does, this must be a missing '}' token, |
1183 | | * so raise the proper error */ |
1184 | 34 | tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); |
1185 | 34 | if (the_current_tok->quote == quote && |
1186 | 34 | the_current_tok->quote_size == quote_size) { |
1187 | 21 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1188 | 21 | "%c-string: expecting '}'", TOK_GET_STRING_PREFIX(tok))); |
1189 | 21 | } |
1190 | 34 | } |
1191 | | |
1192 | 421 | if (quote_size == 3) { |
1193 | 19 | _PyTokenizer_syntaxerror(tok, "unterminated triple-quoted string literal" |
1194 | 19 | " (detected at line %d)", start); |
1195 | 19 | if (c != '\n') { |
1196 | 19 | tok->done = E_EOFS; |
1197 | 19 | } |
1198 | 19 | return MAKE_TOKEN(ERRORTOKEN); |
1199 | 19 | } |
1200 | 402 | else { |
1201 | 402 | if (has_escaped_quote) { |
1202 | 12 | _PyTokenizer_syntaxerror( |
1203 | 12 | tok, |
1204 | 12 | "unterminated string literal (detected at line %d); " |
1205 | 12 | "perhaps you escaped the end quote?", |
1206 | 12 | start |
1207 | 12 | ); |
1208 | 390 | } else { |
1209 | 390 | _PyTokenizer_syntaxerror( |
1210 | 390 | tok, "unterminated string literal (detected at line %d)", start |
1211 | 390 | ); |
1212 | 390 | } |
1213 | 402 | if (c != '\n') { |
1214 | 13 | tok->done = E_EOLS; |
1215 | 13 | } |
1216 | 402 | return MAKE_TOKEN(ERRORTOKEN); |
1217 | 402 | } |
1218 | 421 | } |
1219 | 1.07M | if (c == quote) { |
1220 | 54.9k | end_quote_size += 1; |
1221 | 54.9k | } |
1222 | 1.02M | else { |
1223 | 1.02M | end_quote_size = 0; |
1224 | 1.02M | if (c == '\\') { |
1225 | 33.6k | c = tok_nextc(tok); /* skip escaped char */ |
1226 | 33.6k | if (c == quote) { /* but record whether the escaped char was a quote */ |
1227 | 1.61k | has_escaped_quote = 1; |
1228 | 1.61k | } |
1229 | 33.6k | if (c == '\r') { |
1230 | 218 | c = tok_nextc(tok); |
1231 | 218 | } |
1232 | 33.6k | } |
1233 | 1.02M | } |
1234 | 1.07M | } |
1235 | | |
1236 | 56.3k | p_start = tok->start; |
1237 | 56.3k | p_end = tok->cur; |
1238 | 56.3k | return MAKE_TOKEN(STRING); |
1239 | 56.7k | } |
1240 | | |
1241 | | /* Line continuation */ |
1242 | 821k | if (c == '\\') { |
1243 | 463 | if ((c = tok_continuation_line(tok)) == -1) { |
1244 | 96 | return MAKE_TOKEN(ERRORTOKEN); |
1245 | 96 | } |
1246 | 367 | tok->cont_line = 1; |
1247 | 367 | goto again; /* Read next line */ |
1248 | 463 | } |
1249 | | |
1250 | | /* Punctuation character */ |
1251 | 820k | int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{'); |
1252 | 820k | if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) { |
1253 | | /* This code block gets executed before the curly_bracket_depth is incremented |
1254 | | * by the `{` case, so for ensuring that we are on the 0th level, we need |
1255 | | * to adjust it manually */ |
1256 | 45.8k | int cursor = current_tok->curly_bracket_depth - (c != '{'); |
1257 | 45.8k | int in_format_spec = current_tok->in_format_spec; |
1258 | 45.8k | int cursor_in_format_with_debug = |
1259 | 45.8k | cursor == 1 && (current_tok->in_debug || in_format_spec); |
1260 | 45.8k | int cursor_valid = cursor == 0 || cursor_in_format_with_debug; |
1261 | 45.8k | if ((cursor_valid) && !_PyLexer_update_ftstring_expr(tok, c)) { |
1262 | 0 | return MAKE_TOKEN(ENDMARKER); |
1263 | 0 | } |
1264 | 45.8k | if ((cursor_valid) && c != '{' && set_ftstring_expr(tok, token, c)) { |
1265 | 12 | return MAKE_TOKEN(ERRORTOKEN); |
1266 | 12 | } |
1267 | | |
1268 | 45.7k | if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) { |
1269 | 3.80k | current_tok->kind = TOK_FSTRING_MODE; |
1270 | 3.80k | current_tok->in_format_spec = 1; |
1271 | 3.80k | p_start = tok->start; |
1272 | 3.80k | p_end = tok->cur; |
1273 | 3.80k | return MAKE_TOKEN(_PyToken_OneChar(c)); |
1274 | 3.80k | } |
1275 | 45.7k | } |
1276 | | |
1277 | | /* Check for two-character token */ |
1278 | 817k | { |
1279 | 817k | int c2 = tok_nextc(tok); |
1280 | 817k | int current_token = _PyToken_TwoChars(c, c2); |
1281 | 817k | if (current_token != OP) { |
1282 | 20.6k | int c3 = tok_nextc(tok); |
1283 | 20.6k | int current_token3 = _PyToken_ThreeChars(c, c2, c3); |
1284 | 20.6k | if (current_token3 != OP) { |
1285 | 964 | current_token = current_token3; |
1286 | 964 | } |
1287 | 19.6k | else { |
1288 | 19.6k | tok_backup(tok, c3); |
1289 | 19.6k | } |
1290 | 20.6k | p_start = tok->start; |
1291 | 20.6k | p_end = tok->cur; |
1292 | 20.6k | return MAKE_TOKEN(current_token); |
1293 | 20.6k | } |
1294 | 796k | tok_backup(tok, c2); |
1295 | 796k | } |
1296 | | |
1297 | | /* Keep track of parentheses nesting level */ |
1298 | 0 | switch (c) { |
1299 | 83.6k | case '(': |
1300 | 115k | case '[': |
1301 | 159k | case '{': |
1302 | 159k | if (tok->level >= MAXLEVEL) { |
1303 | 8 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested parentheses")); |
1304 | 8 | } |
1305 | 159k | tok->parenstack[tok->level] = c; |
1306 | 159k | tok->parenlinenostack[tok->level] = tok->lineno; |
1307 | 159k | tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start); |
1308 | 159k | tok->level++; |
1309 | 159k | if (INSIDE_FSTRING(tok)) { |
1310 | 25.7k | current_tok->curly_bracket_depth++; |
1311 | 25.7k | } |
1312 | 159k | break; |
1313 | 55.6k | case ')': |
1314 | 66.4k | case ']': |
1315 | 89.3k | case '}': |
1316 | 89.3k | if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') { |
1317 | 44 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1318 | 44 | "%c-string: single '}' is not allowed", TOK_GET_STRING_PREFIX(tok))); |
1319 | 44 | } |
1320 | 89.2k | if (!tok->tok_extra_tokens && !tok->level) { |
1321 | 219 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "unmatched '%c'", c)); |
1322 | 219 | } |
1323 | 89.0k | if (tok->level > 0) { |
1324 | 89.0k | tok->level--; |
1325 | 89.0k | int opening = tok->parenstack[tok->level]; |
1326 | 89.0k | if (!tok->tok_extra_tokens && !((opening == '(' && c == ')') || |
1327 | 89.0k | (opening == '[' && c == ']') || |
1328 | 89.0k | (opening == '{' && c == '}'))) { |
1329 | | /* If the opening bracket belongs to an f-string's expression |
1330 | | part (e.g. f"{)}") and the closing bracket is an arbitrary |
1331 | | nested expression, then instead of matching a different |
1332 | | syntactical construct with it; we'll throw an unmatched |
1333 | | parentheses error. */ |
1334 | 46 | if (INSIDE_FSTRING(tok) && opening == '{') { |
1335 | 4 | assert(current_tok->curly_bracket_depth >= 0); |
1336 | 4 | int previous_bracket = current_tok->curly_bracket_depth - 1; |
1337 | 4 | if (previous_bracket == current_tok->curly_bracket_expr_start_depth) { |
1338 | 2 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1339 | 2 | "%c-string: unmatched '%c'", TOK_GET_STRING_PREFIX(tok), c)); |
1340 | 2 | } |
1341 | 4 | } |
1342 | 44 | if (tok->parenlinenostack[tok->level] != tok->lineno) { |
1343 | 3 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1344 | 3 | "closing parenthesis '%c' does not match " |
1345 | 3 | "opening parenthesis '%c' on line %d", |
1346 | 3 | c, opening, tok->parenlinenostack[tok->level])); |
1347 | 3 | } |
1348 | 41 | else { |
1349 | 41 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1350 | 41 | "closing parenthesis '%c' does not match " |
1351 | 41 | "opening parenthesis '%c'", |
1352 | 41 | c, opening)); |
1353 | 41 | } |
1354 | 44 | } |
1355 | 89.0k | } |
1356 | | |
1357 | 89.0k | if (INSIDE_FSTRING(tok)) { |
1358 | 18.9k | current_tok->curly_bracket_depth--; |
1359 | 18.9k | if (current_tok->curly_bracket_depth < 0) { |
1360 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "%c-string: unmatched '%c'", |
1361 | 1 | TOK_GET_STRING_PREFIX(tok), c)); |
1362 | 1 | } |
1363 | 18.9k | if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) { |
1364 | 17.2k | current_tok->curly_bracket_expr_start_depth--; |
1365 | 17.2k | current_tok->kind = TOK_FSTRING_MODE; |
1366 | 17.2k | current_tok->in_format_spec = 0; |
1367 | 17.2k | current_tok->in_debug = 0; |
1368 | 17.2k | } |
1369 | 18.9k | } |
1370 | 89.0k | break; |
1371 | 547k | default: |
1372 | 547k | break; |
1373 | 796k | } |
1374 | | |
1375 | 796k | if (!Py_UNICODE_ISPRINTABLE(c)) { |
1376 | 483 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", c)); |
1377 | 483 | } |
1378 | | |
1379 | 795k | if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) { |
1380 | 39.2k | current_tok->in_debug = 1; |
1381 | 39.2k | } |
1382 | | |
1383 | | /* Punctuation character */ |
1384 | 795k | p_start = tok->start; |
1385 | 795k | p_end = tok->cur; |
1386 | 795k | return MAKE_TOKEN(_PyToken_OneChar(c)); |
1387 | 796k | } |
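
The bracket switch near the end of tok_get_normal_mode() is ordinary stack bookkeeping: openers push the character (plus line and column for error messages), closers pop and verify the pairing. A reduced sketch of just that bookkeeping, with hypothetical names and MAXDEPTH standing in for MAXLEVEL:

#define MAXDEPTH 200   /* stands in for MAXLEVEL */

struct brackets {
    char stack[MAXDEPTH];
    int level;
};

/* Returns 0 on success, -1 on a nesting or matching error. */
static int track_bracket(struct brackets *b, char c)
{
    switch (c) {
    case '(': case '[': case '{':
        if (b->level >= MAXDEPTH) {
            return -1;                         /* "too many nested parentheses" */
        }
        b->stack[b->level++] = c;
        return 0;
    case ')': case ']': case '}': {
        if (b->level == 0) {
            return -1;                         /* "unmatched '%c'" */
        }
        char opening = b->stack[--b->level];
        if ((opening == '(' && c == ')') ||
            (opening == '[' && c == ']') ||
            (opening == '{' && c == '}')) {
            return 0;
        }
        return -1;                             /* "closing ... does not match" */
    }
    default:
        return 0;
    }
}
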
1388 | | |
1389 | | static int |
1390 | | tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) |
1391 | 45.7k | { |
1392 | 45.7k | const char *p_start = NULL; |
1393 | 45.7k | const char *p_end = NULL; |
1394 | 45.7k | int end_quote_size = 0; |
1395 | 45.7k | int unicode_escape = 0; |
1396 | | |
1397 | 45.7k | tok->start = tok->cur; |
1398 | 45.7k | tok->first_lineno = tok->lineno; |
1399 | 45.7k | tok->starting_col_offset = tok->col_offset; |
1400 | | |
1401 | | // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize |
1402 | | // before it. |
1403 | 45.7k | int start_char = tok_nextc(tok); |
1404 | 45.7k | if (start_char == '{') { |
1405 | 10.4k | int peek1 = tok_nextc(tok); |
1406 | 10.4k | tok_backup(tok, peek1); |
1407 | 10.4k | tok_backup(tok, start_char); |
1408 | 10.4k | if (peek1 != '{') { |
1409 | 8.28k | current_tok->curly_bracket_expr_start_depth++; |
1410 | 8.28k | if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { |
1411 | 8 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1412 | 8 | "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok))); |
1413 | 8 | } |
1414 | 8.28k | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1415 | 8.28k | return tok_get_normal_mode(tok, current_tok, token); |
1416 | 8.28k | } |
1417 | 10.4k | } |
1418 | 35.2k | else { |
1419 | 35.2k | tok_backup(tok, start_char); |
1420 | 35.2k | } |
1421 | | |
1422 | | // Check if we are at the end of the string |
1423 | 53.3k | for (int i = 0; i < current_tok->quote_size; i++) { |
1424 | 42.4k | int quote = tok_nextc(tok); |
1425 | 42.4k | if (quote != current_tok->quote) { |
1426 | 26.5k | tok_backup(tok, quote); |
1427 | 26.5k | goto f_string_middle; |
1428 | 26.5k | } |
1429 | 42.4k | } |
1430 | | |
1431 | 10.9k | if (current_tok->last_expr_buffer != NULL) { |
1432 | 5.72k | PyMem_Free(current_tok->last_expr_buffer); |
1433 | 5.72k | current_tok->last_expr_buffer = NULL; |
1434 | 5.72k | current_tok->last_expr_size = 0; |
1435 | 5.72k | current_tok->last_expr_end = -1; |
1436 | 5.72k | } |
1437 | | |
1438 | 10.9k | p_start = tok->start; |
1439 | 10.9k | p_end = tok->cur; |
1440 | 10.9k | tok->tok_mode_stack_index--; |
1441 | 10.9k | return MAKE_TOKEN(FTSTRING_END(current_tok)); |
1442 | | |
1443 | 26.5k | f_string_middle: |
1444 | | |
1445 | | // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle |
1446 | | // this. |
1447 | 26.5k | tok->multi_line_start = tok->line_start; |
1448 | 158k | while (end_quote_size != current_tok->quote_size) { |
1449 | 153k | int c = tok_nextc(tok); |
1450 | 153k | if (tok->done == E_ERROR || tok->done == E_DECODE) { |
1451 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1452 | 0 | } |
1453 | 153k | int in_format_spec = ( |
1454 | 153k | current_tok->in_format_spec |
1455 | 153k | && |
1456 | 153k | INSIDE_FSTRING_EXPR(current_tok) |
1457 | 153k | ); |
1458 | | |
1459 | 153k | if (c == EOF || (current_tok->quote_size == 1 && c == '\n')) { |
1460 | 340 | if (tok->decoding_erred) { |
1461 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1462 | 0 | } |
1463 | | |
1464 | | // If we are in a format spec and we found a newline, |
1465 | | // it means that the format spec ends here and we should |
1466 | | // return to the regular mode. |
1467 | 340 | if (in_format_spec && c == '\n') { |
1468 | 75 | if (current_tok->quote_size == 1) { |
1469 | 75 | return MAKE_TOKEN( |
1470 | 75 | _PyTokenizer_syntaxerror( |
1471 | 75 | tok, |
1472 | 75 | "%c-string: newlines are not allowed in format specifiers for single quoted %c-strings", |
1473 | 75 | TOK_GET_STRING_PREFIX(tok), TOK_GET_STRING_PREFIX(tok) |
1474 | 75 | ) |
1475 | 75 | ); |
1476 | 75 | } |
1477 | 0 | tok_backup(tok, c); |
1478 | 0 | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1479 | 0 | current_tok->in_format_spec = 0; |
1480 | 0 | p_start = tok->start; |
1481 | 0 | p_end = tok->cur; |
1482 | 0 | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1483 | 75 | } |
1484 | | |
1485 | 265 | assert(tok->multi_line_start != NULL); |
1486 | | // shift the tok_state's location into |
1487 | | // the start of string, and report the error |
1488 | | // from the initial quote character |
1489 | 265 | tok->cur = (char *)current_tok->start; |
1490 | 265 | tok->cur++; |
1491 | 265 | tok->line_start = current_tok->multi_line_start; |
1492 | 265 | int start = tok->lineno; |
1493 | | |
1494 | 265 | tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); |
1495 | 265 | tok->lineno = the_current_tok->first_line; |
1496 | | |
1497 | 265 | if (current_tok->quote_size == 3) { |
1498 | 28 | _PyTokenizer_syntaxerror(tok, |
1499 | 28 | "unterminated triple-quoted %c-string literal" |
1500 | 28 | " (detected at line %d)", |
1501 | 28 | TOK_GET_STRING_PREFIX(tok), start); |
1502 | 28 | if (c != '\n') { |
1503 | 28 | tok->done = E_EOFS; |
1504 | 28 | } |
1505 | 28 | return MAKE_TOKEN(ERRORTOKEN); |
1506 | 28 | } |
1507 | 237 | else { |
1508 | 237 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1509 | 237 | "unterminated %c-string literal (detected at" |
1510 | 237 | " line %d)", TOK_GET_STRING_PREFIX(tok), start)); |
1511 | 237 | } |
1512 | 265 | } |
1513 | | |
1514 | 152k | if (c == current_tok->quote) { |
1515 | 8.87k | end_quote_size += 1; |
1516 | 8.87k | continue; |
1517 | 143k | } else { |
1518 | 143k | end_quote_size = 0; |
1519 | 143k | } |
1520 | | |
1521 | 143k | if (c == '{') { |
1522 | 16.5k | if (!_PyLexer_update_ftstring_expr(tok, c)) { |
1523 | 0 | return MAKE_TOKEN(ENDMARKER); |
1524 | 0 | } |
1525 | 16.5k | int peek = tok_nextc(tok); |
1526 | 16.5k | if (peek != '{' || in_format_spec) { |
1527 | 13.9k | tok_backup(tok, peek); |
1528 | 13.9k | tok_backup(tok, c); |
1529 | 13.9k | current_tok->curly_bracket_expr_start_depth++; |
1530 | 13.9k | if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { |
1531 | 6 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1532 | 6 | "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok))); |
1533 | 6 | } |
1534 | 13.8k | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1535 | 13.8k | current_tok->in_format_spec = 0; |
1536 | 13.8k | p_start = tok->start; |
1537 | 13.8k | p_end = tok->cur; |
1538 | 13.8k | } else { |
1539 | 2.69k | p_start = tok->start; |
1540 | 2.69k | p_end = tok->cur - 1; |
1541 | 2.69k | } |
1542 | 16.5k | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1543 | 127k | } else if (c == '}') { |
1544 | 4.40k | if (unicode_escape) { |
1545 | 392 | p_start = tok->start; |
1546 | 392 | p_end = tok->cur; |
1547 | 392 | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1548 | 392 | } |
1549 | 4.01k | int peek = tok_nextc(tok); |
1550 | | |
1551 | | // The tokenizer can only be in the format spec if we have already completed the expression |
1552 | | // scanning (indicated by the end of the expression being set) and we are not at the top level |
1553 | | // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double |
1554 | | // brackets, we can bypass it here. |
1555 | 4.01k | int cursor = current_tok->curly_bracket_depth; |
1556 | 4.01k | if (peek == '}' && !in_format_spec && cursor == 0) { |
1557 | 1.63k | p_start = tok->start; |
1558 | 1.63k | p_end = tok->cur - 1; |
1559 | 2.38k | } else { |
1560 | 2.38k | tok_backup(tok, peek); |
1561 | 2.38k | tok_backup(tok, c); |
1562 | 2.38k | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1563 | 2.38k | current_tok->in_format_spec = 0; |
1564 | 2.38k | p_start = tok->start; |
1565 | 2.38k | p_end = tok->cur; |
1566 | 2.38k | } |
1567 | 4.01k | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1568 | 122k | } else if (c == '\\') { |
1569 | 6.90k | int peek = tok_nextc(tok); |
1570 | 6.90k | if (peek == '\r') { |
1571 | 69 | peek = tok_nextc(tok); |
1572 | 69 | } |
1573 | | // Special case when the backslash is right before a curly |
1574 | | // brace. We have to restore and return the control back |
1575 | | // to the loop for the next iteration. |
1576 | 6.90k | if (peek == '{' || peek == '}') { |
1577 | 1.13k | if (!current_tok->raw) { |
1578 | 933 | if (_PyTokenizer_warn_invalid_escape_sequence(tok, peek)) { |
1579 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1580 | 0 | } |
1581 | 933 | } |
1582 | 1.13k | tok_backup(tok, peek); |
1583 | 1.13k | continue; |
1584 | 1.13k | } |
1585 | | |
1586 | 5.77k | if (!current_tok->raw) { |
1587 | 5.52k | if (peek == 'N') { |
1588 | | /* Handle named unicode escapes (\N{BULLET}) */ |
1589 | 499 | peek = tok_nextc(tok); |
1590 | 499 | if (peek == '{') { |
1591 | 405 | unicode_escape = 1; |
1592 | 405 | } else { |
1593 | 94 | tok_backup(tok, peek); |
1594 | 94 | } |
1595 | 499 | } |
1596 | 5.52k | } /* else { |
1597 | | skip the escaped character |
1598 | | }*/ |
1599 | 5.77k | } |
1600 | 143k | } |
1601 | | |
1602 | | // Backup the f-string quotes to emit a final FSTRING_MIDDLE and |
1603 | | // add the quotes to the FSTRING_END in the next tokenizer iteration. |
1604 | 11.4k | for (int i = 0; i < current_tok->quote_size; i++) { |
1605 | 6.26k | tok_backup(tok, current_tok->quote); |
1606 | 6.26k | } |
1607 | 5.16k | p_start = tok->start; |
1608 | 5.16k | p_end = tok->cur; |
1609 | 5.16k | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1610 | 26.5k | } |
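
tok_get() below dispatches on the mode at the top of tok_mode_stack: f/t-string bodies and regular code alternate as '{' and '}' are crossed, and each nested f-string pushes a fresh entry (bounded by MAXFSTRINGLEVEL). A toy sketch of that stacked two-mode dispatch, with all names hypothetical:

enum mode_kind { REGULAR_MODE, FSTRING_MODE };

#define MAX_NESTING 150            /* stands in for MAXFSTRINGLEVEL */

struct mode_stack {
    enum mode_kind kinds[MAX_NESTING];
    int top;                       /* current mode; entry 0 is REGULAR_MODE */
};

/* Entering f"..." or t"..." pushes a new string mode. */
static int push_string_mode(struct mode_stack *ms)
{
    if (ms->top + 1 >= MAX_NESTING) {
        return -1;                 /* "too many nested f-strings or t-strings" */
    }
    ms->kinds[++ms->top] = FSTRING_MODE;
    return 0;
}

/* '{' in the body flips the current entry to expression (regular) mode,
   the matching '}' flips it back, and the closing quote pops the entry. */
static void enter_expr(struct mode_stack *ms) { ms->kinds[ms->top] = REGULAR_MODE; }
static void leave_expr(struct mode_stack *ms) { ms->kinds[ms->top] = FSTRING_MODE; }
static void pop_string_mode(struct mode_stack *ms) { ms->top--; }
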
1611 | | |
1612 | | static int |
1613 | | tok_get(struct tok_state *tok, struct token *token) |
1614 | 1.68M | { |
1615 | 1.68M | tokenizer_mode *current_tok = TOK_GET_MODE(tok); |
1616 | 1.68M | if (current_tok->kind == TOK_REGULAR_MODE) { |
1617 | 1.63M | return tok_get_normal_mode(tok, current_tok, token); |
1618 | 1.63M | } else { |
1619 | 45.7k | return tok_get_fstring_mode(tok, current_tok, token); |
1620 | 45.7k | } |
1621 | 1.68M | } |
1622 | | |
1623 | | int |
1624 | | _PyTokenizer_Get(struct tok_state *tok, struct token *token) |
1625 | 1.68M | { |
1626 | 1.68M | int result = tok_get(tok, token); |
1627 | 1.68M | if (tok->decoding_erred) { |
1628 | 0 | result = ERRORTOKEN; |
1629 | 0 | tok->done = E_DECODE; |
1630 | 0 | } |
1631 | 1.68M | return result; |
1632 | 1.68M | } |
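
_PyTokenizer_Get() is the boundary the parser drives: it is called in a loop, each call filling one struct token, until ENDMARKER or ERRORTOKEN comes back. A hedged sketch of such a driver; the zero-initialization and metadata cleanup are simplifications, and constructing a ready tok_state (from a file, string, or readline source) is out of scope here:

/* Illustrative driver only: assumes `tok` was set up by one of the
   tokenizer constructors in Parser/tokenizer/. */
static int lex_all(struct tok_state *tok)
{
    for (;;) {
        struct token token = {0};          /* simplified token init */
        int type = _PyTokenizer_Get(tok, &token);
        if (type == ERRORTOKEN) {
            return -1;     /* tok->done holds the specific E_* error code */
        }
        /* ... hand (type, token.start, token.end, token.metadata)
           to the consumer here ... */
        Py_XDECREF(token.metadata);        /* simplified cleanup */
        if (type == ENDMARKER) {
            return 0;
        }
    }
}
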