/src/cpython/Parser/lexer/lexer.c
Line | Count | Source |
1 | | #include "Python.h" |
2 | | #include "pycore_token.h" |
3 | | #include "pycore_unicodeobject.h" |
4 | | #include "errcode.h" |
5 | | |
6 | | #include "state.h" |
7 | | #include "../tokenizer/helpers.h" |
8 | | |
9 | | /* Alternate tab spacing */ |
10 | 9.92k | #define ALTTABSIZE 1 |
11 | | |
12 | 1.76M | #define is_potential_identifier_start(c) (\ |
13 | 1.76M | (c >= 'a' && c <= 'z')\ |
14 | 1.76M | || (c >= 'A' && c <= 'Z')\ |
15 | 1.76M | || c == '_'\ |
16 | 1.76M | || (c >= 128)) |
17 | | |
18 | 2.34M | #define is_potential_identifier_char(c) (\ |
19 | 2.34M | (c >= 'a' && c <= 'z')\ |
20 | 2.34M | || (c >= 'A' && c <= 'Z')\ |
21 | 2.34M | || (c >= '0' && c <= '9')\ |
22 | 2.34M | || c == '_'\ |
23 | 2.34M | || (c >= 128)) |
24 | | |
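A quick throwaway check of how the two classifiers above differ (a sketch, not part of lexer.c; compile it with the two macros above in scope). Note that any byte >= 128 is only *potentially* an identifier character here — the real PEP 3131 validation happens later in verify_identifier().

#include <assert.h>

int main(void) {
    assert(is_potential_identifier_start('_'));   /* names may start with '_' */
    assert(!is_potential_identifier_start('1'));  /* ...but not with a digit */
    assert(is_potential_identifier_char('1'));    /* digits may continue a name */
    assert(is_potential_identifier_start(0xC3));  /* UTF-8 lead byte: deferred check */
    return 0;
}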
25 | | #ifdef Py_DEBUG |
26 | | static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) { |
27 | | assert(tok->tok_mode_stack_index >= 0); |
28 | | assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL); |
29 | | return &(tok->tok_mode_stack[tok->tok_mode_stack_index]); |
30 | | } |
31 | | static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) { |
32 | | assert(tok->tok_mode_stack_index >= 0); |
33 | | assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL); |
34 | | return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]); |
35 | | } |
36 | | #else |
37 | 1.89M | #define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index])) |
38 | 16.8k | #define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index])) |
39 | | #endif |
40 | | |
41 | | #define FTSTRING_MIDDLE(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_MIDDLE : FSTRING_MIDDLE) |
42 | | #define FTSTRING_END(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_END : FSTRING_END) |
43 | 32 | #define TOK_GET_STRING_PREFIX(tok) (TOK_GET_MODE(tok)->string_kind == TSTRING ? 't' : 'f') |
44 | 1.77M | #define MAKE_TOKEN(token_type) _PyLexer_token_setup(tok, token, token_type, p_start, p_end) |
45 | 0 | #define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\ |
46 | 0 | _PyLexer_type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) |
47 | | |
48 | | /* Spaces in this constant are treated as "zero or more spaces or tabs" when |
49 | | tokenizing. */ |
50 | | static const char* type_comment_prefix = "# type: "; |
51 | | |
52 | | static inline int |
53 | | contains_null_bytes(const char* str, size_t size) |
54 | 224k | { |
55 | 224k | return memchr(str, 0, size) != NULL; |
56 | 224k | } |
57 | | |
58 | | /* Get next char, updating state; error code goes into tok->done */ |
59 | | static int |
60 | | tok_nextc(struct tok_state *tok) |
61 | 10.6M | { |
62 | 10.6M | int rc; |
63 | 10.9M | for (;;) { |
64 | 10.9M | if (tok->cur != tok->inp) { |
65 | 10.6M | if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) { |
66 | 0 | tok->done = E_COLUMNOVERFLOW; |
67 | 0 | return EOF; |
68 | 0 | } |
69 | 10.6M | tok->col_offset++; |
70 | 10.6M | return Py_CHARMASK(*tok->cur++); /* Fast path */ |
71 | 10.6M | } |
72 | 277k | if (tok->done != E_OK) { |
73 | 35.2k | return EOF; |
74 | 35.2k | } |
75 | 242k | rc = tok->underflow(tok); |
76 | | #if defined(Py_DEBUG) |
77 | | if (tok->debug) { |
78 | | fprintf(stderr, "line[%d] = ", tok->lineno); |
79 | | _PyTokenizer_print_escape(stderr, tok->cur, tok->inp - tok->cur); |
80 | | fprintf(stderr, " tok->done = %d\n", tok->done); |
81 | | } |
82 | | #endif |
83 | 242k | if (!rc) { |
84 | 17.7k | tok->cur = tok->inp; |
85 | 17.7k | return EOF; |
86 | 17.7k | } |
87 | 224k | tok->line_start = tok->cur; |
88 | | |
89 | 224k | if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) { |
90 | 0 | _PyTokenizer_syntaxerror(tok, "source code cannot contain null bytes"); |
91 | 0 | tok->cur = tok->inp; |
92 | 0 | return EOF; |
93 | 0 | } |
94 | 224k | } |
95 | 10.6M | Py_UNREACHABLE(); |
96 | 10.6M | } |
97 | | |
98 | | /* Back-up one character */ |
99 | | static void |
100 | | tok_backup(struct tok_state *tok, int c) |
101 | 3.73M | { |
102 | 3.73M | if (c != EOF) { |
103 | 3.70M | if (--tok->cur < tok->buf) { |
104 | 0 | Py_FatalError("tokenizer beginning of buffer"); |
105 | 0 | } |
106 | 3.70M | if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) { |
107 | 0 | Py_FatalError("tok_backup: wrong character"); |
108 | 0 | } |
109 | 3.70M | tok->col_offset--; |
110 | 3.70M | } |
111 | 3.73M | } |
112 | | |
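Together, tok_nextc() and tok_backup() give the rest of this file its one-character peek idiom. A hypothetical helper (not in lexer.c) showing the pattern:

/* Look at the next character without consuming it. tok_backup() accepts
   EOF and ignores it, so this is safe at end of input. */
static int
peek_char(struct tok_state *tok)
{
    int c = tok_nextc(tok);   /* advances tok->cur and tok->col_offset */
    tok_backup(tok, c);       /* rewinds both, verifying the byte matches */
    return c;
}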
113 | | static int |
114 | 22.4k | set_ftstring_expr(struct tok_state* tok, struct token *token, char c) { |
115 | 22.4k | assert(token != NULL); |
116 | 22.4k | assert(c == '}' || c == ':' || c == '!'); |
117 | 22.4k | tokenizer_mode *tok_mode = TOK_GET_MODE(tok); |
118 | | |
119 | 22.4k | if (!(tok_mode->in_debug || tok_mode->string_kind == TSTRING) || token->metadata) { |
120 | 13.7k | return 0; |
121 | 13.7k | } |
122 | 8.70k | PyObject *res = NULL; |
123 | | |
124 | | // Check if there is a # character in the expression |
125 | 8.70k | int hash_detected = 0; |
126 | 1.36M | for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) { |
127 | 1.35M | if (tok_mode->last_expr_buffer[i] == '#') { |
128 | 1.06k | hash_detected = 1; |
129 | 1.06k | break; |
130 | 1.06k | } |
131 | 1.35M | } |
132 | | |
133 | 8.70k | if (hash_detected) { |
134 | 1.06k | Py_ssize_t input_length = tok_mode->last_expr_size - tok_mode->last_expr_end; |
135 | 1.06k | char *result = (char *)PyMem_Malloc((input_length + 1) * sizeof(char)); |
136 | 1.06k | if (!result) { |
137 | 0 | return -1; |
138 | 0 | } |
139 | | |
140 | 1.06k | Py_ssize_t i = 0; |
141 | 1.06k | Py_ssize_t j = 0; |
142 | | |
143 | 39.2k | for (i = 0, j = 0; i < input_length; i++) { |
144 | 38.1k | if (tok_mode->last_expr_buffer[i] == '#') { |
145 | | // Skip characters until newline or end of string |
146 | 20.3k | while (i < input_length && tok_mode->last_expr_buffer[i] != '\0') { |
147 | 19.3k | if (tok_mode->last_expr_buffer[i] == '\n') { |
148 | 288 | result[j++] = tok_mode->last_expr_buffer[i]; |
149 | 288 | break; |
150 | 288 | } |
151 | 19.1k | i++; |
152 | 19.1k | } |
153 | 36.9k | } else { |
154 | 36.9k | result[j++] = tok_mode->last_expr_buffer[i]; |
155 | 36.9k | } |
156 | 38.1k | } |
157 | | |
158 | 1.06k | result[j] = '\0'; // Null-terminate the result string |
159 | 1.06k | res = PyUnicode_DecodeUTF8(result, j, NULL); |
160 | 1.06k | PyMem_Free(result); |
161 | 7.63k | } else { |
162 | 7.63k | res = PyUnicode_DecodeUTF8( |
163 | 7.63k | tok_mode->last_expr_buffer, |
164 | 7.63k | tok_mode->last_expr_size - tok_mode->last_expr_end, |
165 | 7.63k | NULL |
166 | 7.63k | ); |
167 | | |
168 | 7.63k | } |
169 | | |
170 | | |
171 | 8.70k | if (!res) { |
172 | 5 | return -1; |
173 | 5 | } |
174 | 8.69k | token->metadata = res; |
175 | 8.69k | return 0; |
176 | 8.70k | } |
177 | | |
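The hash-stripping loop above is easier to read in isolation. A standalone sketch of the same transformation over plain buffers (hypothetical helper, simplified):

#include <stddef.h>

/* Copy src[0..n) into dst, dropping each '#' comment up to — but keeping —
   its terminating newline, as set_ftstring_expr() does for debug f-strings. */
static size_t
strip_hash_comments(const char *src, size_t n, char *dst)
{
    size_t j = 0;
    for (size_t i = 0; i < n; i++) {
        if (src[i] == '#') {
            while (i < n && src[i] != '\n') {
                i++;                      /* skip the comment body */
            }
            if (i < n) {
                dst[j++] = '\n';          /* keep the newline itself */
            }
        }
        else {
            dst[j++] = src[i];
        }
    }
    dst[j] = '\0';                        /* caller must size dst to n + 1 */
    return j;
}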
178 | | int |
179 | | _PyLexer_update_ftstring_expr(struct tok_state *tok, char cur) |
180 | 65.9k | { |
181 | 65.9k | assert(tok->cur != NULL); |
182 | | |
183 | 65.9k | Py_ssize_t size = strlen(tok->cur); |
184 | 65.9k | tokenizer_mode *tok_mode = TOK_GET_MODE(tok); |
185 | | |
186 | 65.9k | switch (cur) { |
187 | 0 | case 0: |
188 | 0 | if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) { |
189 | 0 | return 1; |
190 | 0 | } |
191 | 0 | char *new_buffer = PyMem_Realloc( |
192 | 0 | tok_mode->last_expr_buffer, |
193 | 0 | tok_mode->last_expr_size + size |
194 | 0 | ); |
195 | 0 | if (new_buffer == NULL) { |
196 | 0 | PyMem_Free(tok_mode->last_expr_buffer); |
197 | 0 | goto error; |
198 | 0 | } |
199 | 0 | tok_mode->last_expr_buffer = new_buffer; |
200 | 0 | strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size); |
201 | 0 | tok_mode->last_expr_size += size; |
202 | 0 | break; |
203 | 43.5k | case '{': |
204 | 43.5k | if (tok_mode->last_expr_buffer != NULL) { |
205 | 33.1k | PyMem_Free(tok_mode->last_expr_buffer); |
206 | 33.1k | } |
207 | 43.5k | tok_mode->last_expr_buffer = PyMem_Malloc(size); |
208 | 43.5k | if (tok_mode->last_expr_buffer == NULL) { |
209 | 0 | goto error; |
210 | 0 | } |
211 | 43.5k | tok_mode->last_expr_size = size; |
212 | 43.5k | tok_mode->last_expr_end = -1; |
213 | 43.5k | strncpy(tok_mode->last_expr_buffer, tok->cur, size); |
214 | 43.5k | break; |
215 | 18.3k | case '}': |
216 | 19.8k | case '!': |
217 | 19.8k | tok_mode->last_expr_end = strlen(tok->start); |
218 | 19.8k | break; |
219 | 2.56k | case ':': |
220 | 2.56k | if (tok_mode->last_expr_end == -1) { |
221 | 2.33k | tok_mode->last_expr_end = strlen(tok->start); |
222 | 2.33k | } |
223 | 2.56k | break; |
224 | 0 | default: |
225 | 0 | Py_UNREACHABLE(); |
226 | 65.9k | } |
227 | 65.9k | return 1; |
228 | 0 | error: |
229 | 0 | tok->done = E_NOMEM; |
230 | 0 | return 0; |
231 | 65.9k | } |
232 | | |
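A hypothetical trace of the snapshot/offset bookkeeping above while lexing f"{a + b=}" (illustrative, not an exact transcript):

/* _PyLexer_update_ftstring_expr('{')  — copy the rest of the current input
 *     into last_expr_buffer and reset last_expr_end to -1;
 * _PyLexer_update_ftstring_expr('}')  — set last_expr_end from tok->start,
 *     so that last_expr_size - last_expr_end measures just the text between
 *     the braces, which set_ftstring_expr() later decodes into the token's
 *     metadata. */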
233 | | static int |
234 | | lookahead(struct tok_state *tok, const char *test) |
235 | 8.92k | { |
236 | 8.92k | const char *s = test; |
237 | 8.92k | int res = 0; |
238 | 23.4k | while (1) { |
239 | 23.4k | int c = tok_nextc(tok); |
240 | 23.4k | if (*s == 0) { |
241 | 8.82k | res = !is_potential_identifier_char(c); |
242 | 8.82k | } |
243 | 14.6k | else if (c == *s) { |
244 | 14.5k | s++; |
245 | 14.5k | continue; |
246 | 14.5k | } |
247 | | |
248 | 8.92k | tok_backup(tok, c); |
249 | 23.4k | while (s != test) { |
250 | 14.5k | tok_backup(tok, *--s); |
251 | 14.5k | } |
252 | 8.92k | return res; |
253 | 23.4k | } |
254 | 8.92k | } |
255 | | |
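lookahead() is a restoring matcher: it consumes input only while it matches `test`, then pushes every character back regardless of the outcome. The same contract over a plain string, as a hypothetical helper:

/* Return 1 iff s begins with the expected keyword tail and the match is not
   followed by another identifier character — e.g. tail "r" (for "or")
   matches "r x" but not "range". */
static int
matches_keyword_tail(const char *s, const char *tail)
{
    while (*tail) {
        if (*s++ != *tail++) {
            return 0;
        }
    }
    return !is_potential_identifier_char((unsigned char)*s);
}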
256 | | static int |
257 | 98.6k | verify_end_of_number(struct tok_state *tok, int c, const char *kind) { |
258 | 98.6k | if (tok->tok_extra_tokens) { |
259 | | // When we are parsing extra tokens, we don't want to emit warnings |
260 | | // about invalid literals, because we want to be a bit more liberal. |
261 | 0 | return 1; |
262 | 0 | } |
263 | | /* Emit a deprecation warning only if the numeric literal is immediately |
264 | | * followed by one of the keywords that can occur after a numeric literal |
265 | | * in valid code: "and", "else", "for", "if", "in", "is" and "or". |
266 | | * This allows gradually deprecating existing valid code without emitting |
267 | | * a warning before the error in most cases of an invalid numeric literal |
268 | | * (which would be confusing and break existing tests). |
269 | | * Raise a syntax error with a slightly better message than plain |
270 | | * "invalid syntax" if the numeric literal is immediately followed by |
271 | | * another keyword or identifier. |
272 | | */ |
273 | 98.6k | int r = 0; |
274 | 98.6k | if (c == 'a') { |
275 | 838 | r = lookahead(tok, "nd"); |
276 | 838 | } |
277 | 97.7k | else if (c == 'e') { |
278 | 469 | r = lookahead(tok, "lse"); |
279 | 469 | } |
280 | 97.3k | else if (c == 'f') { |
281 | 3.55k | r = lookahead(tok, "or"); |
282 | 3.55k | } |
283 | 93.7k | else if (c == 'i') { |
284 | 1.90k | int c2 = tok_nextc(tok); |
285 | 1.90k | if (c2 == 'f' || c2 == 'n' || c2 == 's') { |
286 | 1.88k | r = 1; |
287 | 1.88k | } |
288 | 1.90k | tok_backup(tok, c2); |
289 | 1.90k | } |
290 | 91.8k | else if (c == 'o') { |
291 | 3.54k | r = lookahead(tok, "r"); |
292 | 3.54k | } |
293 | 88.3k | else if (c == 'n') { |
294 | 520 | r = lookahead(tok, "ot"); |
295 | 520 | } |
296 | 98.6k | if (r) { |
297 | 10.6k | tok_backup(tok, c); |
298 | 10.6k | if (_PyTokenizer_parser_warn(tok, PyExc_SyntaxWarning, |
299 | 10.6k | "invalid %s literal", kind)) |
300 | 0 | { |
301 | 0 | return 0; |
302 | 0 | } |
303 | 10.6k | tok_nextc(tok); |
304 | 10.6k | } |
305 | 87.9k | else /* In future releases, only error will remain. */ |
306 | 87.9k | if (c < 128 && is_potential_identifier_char(c)) { |
307 | 213 | tok_backup(tok, c); |
308 | 213 | _PyTokenizer_syntaxerror(tok, "invalid %s literal", kind); |
309 | 213 | return 0; |
310 | 213 | } |
311 | 98.4k | return 1; |
312 | 98.6k | } |
313 | | |
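The branch chain above amounts to a small table from the first trailing character to the keyword tail handed to lookahead(); 'i' is handled inline because "if", "in" and "is" share a first letter. A condensed sketch (hypothetical helper):

static const char *
keyword_tail_after_number(int c)
{
    switch (c) {
        case 'a': return "nd";    /* 1and x  ->  1 and x  */
        case 'e': return "lse";   /* 1else   ->  1 else   */
        case 'f': return "or";    /* 1for x  ->  1 for x  */
        case 'o': return "r";     /* 1or x   ->  1 or x   */
        case 'n': return "ot";    /* 1not x  ->  1 not x  */
        default:  return NULL;    /* 'i' (if/in/is) is special-cased */
    }
}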
314 | | /* Verify that the identifier follows PEP 3131. */ |
315 | | static int |
316 | | verify_identifier(struct tok_state *tok) |
317 | 14.2k | { |
318 | 14.2k | if (tok->tok_extra_tokens) { |
319 | 0 | return 1; |
320 | 0 | } |
321 | 14.2k | PyObject *s; |
322 | 14.2k | if (tok->decoding_erred) |
323 | 0 | return 0; |
324 | 14.2k | s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); |
325 | 14.2k | if (s == NULL) { |
326 | 1.01k | if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { |
327 | 1.01k | tok->done = E_DECODE; |
328 | 1.01k | } |
329 | 0 | else { |
330 | 0 | tok->done = E_ERROR; |
331 | 0 | } |
332 | 1.01k | return 0; |
333 | 1.01k | } |
334 | 13.2k | Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); |
335 | 13.2k | assert(invalid >= 0); |
336 | 13.2k | assert(PyUnicode_GET_LENGTH(s) > 0); |
337 | 13.2k | if (invalid < PyUnicode_GET_LENGTH(s)) { |
338 | 631 | Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); |
339 | 631 | if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { |
340 | | /* Determine the offset in UTF-8 encoded input */ |
341 | 430 | Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); |
342 | 430 | if (s != NULL) { |
343 | 430 | Py_SETREF(s, PyUnicode_AsUTF8String(s)); |
344 | 430 | } |
345 | 430 | if (s == NULL) { |
346 | 0 | tok->done = E_ERROR; |
347 | 0 | return 0; |
348 | 0 | } |
349 | 430 | tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); |
350 | 430 | } |
351 | 631 | Py_DECREF(s); |
352 | 631 | if (Py_UNICODE_ISPRINTABLE(ch)) { |
353 | 361 | _PyTokenizer_syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch); |
354 | 361 | } |
355 | 270 | else { |
356 | 270 | _PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", ch); |
357 | 270 | } |
358 | 631 | return 0; |
359 | 631 | } |
360 | 12.5k | Py_DECREF(s); |
361 | 12.5k | return 1; |
362 | 13.2k | } |
363 | | |
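The internal _PyUnicode_ScanIdentifier() walk above has a public counterpart. A sketch of the same PEP 3131 check through the C API (assumes an initialized interpreter; hypothetical helper):

#include <Python.h>
#include <string.h>

/* Return 1 if utf8 names a valid identifier, 0 otherwise. */
static int
is_valid_identifier(const char *utf8)
{
    PyObject *s = PyUnicode_DecodeUTF8(utf8, (Py_ssize_t)strlen(utf8), NULL);
    if (s == NULL) {
        PyErr_Clear();                    /* undecodable bytes are not a name */
        return 0;
    }
    int ok = PyUnicode_IsIdentifier(s);   /* 1 for "ä", 0 for "1x" */
    Py_DECREF(s);
    return ok;
}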
364 | | static int |
365 | | tok_decimal_tail(struct tok_state *tok) |
366 | 80.0k | { |
367 | 80.0k | int c; |
368 | | |
369 | 80.6k | while (1) { |
370 | 224k | do { |
371 | 224k | c = tok_nextc(tok); |
372 | 224k | } while (Py_ISDIGIT(c)); |
373 | 80.6k | if (c != '_') { |
374 | 80.0k | break; |
375 | 80.0k | } |
376 | 562 | c = tok_nextc(tok); |
377 | 562 | if (!Py_ISDIGIT(c)) { |
378 | 15 | tok_backup(tok, c); |
379 | 15 | _PyTokenizer_syntaxerror(tok, "invalid decimal literal"); |
380 | 15 | return 0; |
381 | 15 | } |
382 | 562 | } |
383 | 80.0k | return c; |
384 | 80.0k | } |
385 | | |
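The grammar tok_decimal_tail() accepts is digit groups joined by single underscores, each underscore followed by at least one digit. The same rule over a whole string, as a standalone sketch (hypothetical helper):

#include <ctype.h>

/* Return 1 for "1", "1_000", "12_34_56"; 0 for "1_", "1__2", "_1". */
static int
is_decimal_run(const char *s)
{
    for (;;) {
        if (!isdigit((unsigned char)*s)) {
            return 0;                 /* every group must start with a digit */
        }
        while (isdigit((unsigned char)*s)) {
            s++;
        }
        if (*s != '_') {
            return *s == '\0';        /* the run ends cleanly */
        }
        s++;                          /* '_' must be followed by more digits */
    }
}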
386 | | static inline int |
387 | 1.20k | tok_continuation_line(struct tok_state *tok) { |
388 | 1.20k | int c = tok_nextc(tok); |
389 | 1.20k | if (c == '\r') { |
390 | 68 | c = tok_nextc(tok); |
391 | 68 | } |
392 | 1.20k | if (c != '\n') { |
393 | 66 | tok->done = E_LINECONT; |
394 | 66 | return -1; |
395 | 66 | } |
396 | 1.14k | c = tok_nextc(tok); |
397 | 1.14k | if (c == EOF) { |
398 | 36 | tok->done = E_EOF; |
399 | 36 | tok->cur = tok->inp; |
400 | 36 | return -1; |
401 | 1.10k | } else { |
402 | 1.10k | tok_backup(tok, c); |
403 | 1.10k | } |
404 | 1.10k | return c; |
405 | 1.14k | } |
406 | | |
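For reference, a summary of what this accepts after the caller's backslash (mirroring the branches above):

/*   "\n"    — Unix line ending: continuation succeeds;
 *   "\r\n"  — Windows line ending: the '\r' is consumed, then the '\n';
 *   a bare '\r' or any other character sets tok->done = E_LINECONT, and
 *   EOF right after the newline sets tok->done = E_EOF. */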
407 | | static int |
408 | | maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok, |
409 | | int saw_b, int saw_r, int saw_u, |
410 | 22.0k | int saw_f, int saw_t) { |
411 | | // Supported: rb, rf, rt (in any order) |
412 | | // Unsupported: ub, ur, uf, ut, bf, bt, ft (in any order) |
413 | | |
414 | 22.0k | #define RETURN_SYNTAX_ERROR(PREFIX1, PREFIX2) \ |
415 | 22.0k | do { \ |
416 | 8 | (void)_PyTokenizer_syntaxerror_known_range( \ |
417 | 8 | tok, (int)(tok->start + 1 - tok->line_start), \ |
418 | 8 | (int)(tok->cur - tok->line_start), \ |
419 | 8 | "'" PREFIX1 "' and '" PREFIX2 "' prefixes are incompatible"); \ |
420 | 8 | return -1; \ |
421 | 8 | } while (0) |
422 | | |
423 | 22.0k | if (saw_u && saw_b) { |
424 | 1 | RETURN_SYNTAX_ERROR("u", "b"); |
425 | 1 | } |
426 | 22.0k | if (saw_u && saw_r) { |
427 | 1 | RETURN_SYNTAX_ERROR("u", "r"); |
428 | 1 | } |
429 | 22.0k | if (saw_u && saw_f) { |
430 | 1 | RETURN_SYNTAX_ERROR("u", "f"); |
431 | 1 | } |
432 | 22.0k | if (saw_u && saw_t) { |
433 | 1 | RETURN_SYNTAX_ERROR("u", "t"); |
434 | 1 | } |
435 | | |
436 | 22.0k | if (saw_b && saw_f) { |
437 | 2 | RETURN_SYNTAX_ERROR("b", "f"); |
438 | 2 | } |
439 | 22.0k | if (saw_b && saw_t) { |
440 | 1 | RETURN_SYNTAX_ERROR("b", "t"); |
441 | 1 | } |
442 | | |
443 | 22.0k | if (saw_f && saw_t) { |
444 | 1 | RETURN_SYNTAX_ERROR("f", "t"); |
445 | 1 | } |
446 | | |
447 | 22.0k | #undef RETURN_SYNTAX_ERROR |
448 | | |
449 | 22.0k | return 0; |
450 | 22.0k | } |
451 | | |
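The pairwise checks above implement a small compatibility matrix: 'u' combines with nothing, 'b' only with 'r', and 'f'/'t' each combine with 'r' but not with 'b' or each other. Condensed into a single predicate (hypothetical helper):

static int
prefixes_compatible(int saw_b, int saw_r, int saw_u, int saw_f, int saw_t)
{
    if (saw_u) {
        return !(saw_b || saw_r || saw_f || saw_t);  /* u stands alone */
    }
    if (saw_b) {
        return !(saw_f || saw_t);                    /* rb ok; bf/bt not */
    }
    return !(saw_f && saw_t);                        /* rf/rt ok; ft not */
}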
452 | | static int |
453 | | tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) |
454 | 1.73M | { |
455 | 1.73M | int c; |
456 | 1.73M | int blankline, nonascii; |
457 | | |
458 | 1.73M | const char *p_start = NULL; |
459 | 1.73M | const char *p_end = NULL; |
460 | 1.81M | nextline: |
461 | 1.81M | tok->start = NULL; |
462 | 1.81M | tok->starting_col_offset = -1; |
463 | 1.81M | blankline = 0; |
464 | | |
465 | | |
466 | | /* Get indentation level */ |
467 | 1.81M | if (tok->atbol) { |
468 | 225k | int col = 0; |
469 | 225k | int altcol = 0; |
470 | 225k | tok->atbol = 0; |
471 | 225k | int cont_line_col = 0; |
472 | 839k | for (;;) { |
473 | 839k | c = tok_nextc(tok); |
474 | 839k | if (c == ' ') { |
475 | 606k | col++, altcol++; |
476 | 606k | } |
477 | 233k | else if (c == '\t') { |
478 | 4.96k | col = (col / tok->tabsize + 1) * tok->tabsize; |
479 | 4.96k | altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE; |
480 | 4.96k | } |
481 | 228k | else if (c == '\014') {/* Control-L (formfeed) */ |
482 | 2.11k | col = altcol = 0; /* For Emacs users */ |
483 | 2.11k | } |
484 | 226k | else if (c == '\\') { |
485 | | // Indentation cannot be split over multiple physical lines |
486 | | // using backslashes. This means that if we found a backslash |
487 | | // preceded by whitespace, **the first one we find** determines |
488 | | // the level of indentation of whatever comes next. |
489 | 782 | cont_line_col = cont_line_col ? cont_line_col : col; |
490 | 782 | if ((c = tok_continuation_line(tok)) == -1) { |
491 | 30 | return MAKE_TOKEN(ERRORTOKEN); |
492 | 30 | } |
493 | 782 | } |
494 | 225k | else { |
495 | 225k | break; |
496 | 225k | } |
497 | 839k | } |
498 | 225k | tok_backup(tok, c); |
499 | 225k | if (c == '#' || c == '\n' || c == '\r') { |
500 | | /* Lines with only whitespace and/or comments |
501 | | shouldn't affect the indentation and are |
502 | | not passed to the parser as NEWLINE tokens, |
503 | | except *totally* empty lines in interactive |
504 | | mode, which signal the end of a command group. */ |
505 | 44.6k | if (col == 0 && c == '\n' && tok->prompt != NULL) { |
506 | 0 | blankline = 0; /* Let it through */ |
507 | 0 | } |
508 | 44.6k | else if (tok->prompt != NULL && tok->lineno == 1) { |
509 | | /* In interactive mode, if the first line contains |
510 | | only spaces and/or a comment, let it through. */ |
511 | 0 | blankline = 0; |
512 | 0 | col = altcol = 0; |
513 | 0 | } |
514 | 44.6k | else { |
515 | 44.6k | blankline = 1; /* Ignore completely */ |
516 | 44.6k | } |
517 | | /* We can't jump back right here since we still |
518 | | may need to skip to the end of a comment */ |
519 | 44.6k | } |
520 | 225k | if (!blankline && tok->level == 0) { |
521 | 141k | col = cont_line_col ? cont_line_col : col; |
522 | 141k | altcol = cont_line_col ? cont_line_col : altcol; |
523 | 141k | if (col == tok->indstack[tok->indent]) { |
524 | | /* No change */ |
525 | 107k | if (altcol != tok->altindstack[tok->indent]) { |
526 | 1 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
527 | 1 | } |
528 | 107k | } |
529 | 34.1k | else if (col > tok->indstack[tok->indent]) { |
530 | | /* Indent -- always one */ |
531 | 19.1k | if (tok->indent+1 >= MAXINDENT) { |
532 | 0 | tok->done = E_TOODEEP; |
533 | 0 | tok->cur = tok->inp; |
534 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
535 | 0 | } |
536 | 19.1k | if (altcol <= tok->altindstack[tok->indent]) { |
537 | 3 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
538 | 3 | } |
539 | 19.1k | tok->pendin++; |
540 | 19.1k | tok->indstack[++tok->indent] = col; |
541 | 19.1k | tok->altindstack[tok->indent] = altcol; |
542 | 19.1k | } |
543 | 14.9k | else /* col < tok->indstack[tok->indent] */ { |
544 | | /* Dedent -- any number, must be consistent */ |
545 | 33.3k | while (tok->indent > 0 && |
546 | 33.3k | col < tok->indstack[tok->indent]) { |
547 | 18.3k | tok->pendin--; |
548 | 18.3k | tok->indent--; |
549 | 18.3k | } |
550 | 14.9k | if (col != tok->indstack[tok->indent]) { |
551 | 8 | tok->done = E_DEDENT; |
552 | 8 | tok->cur = tok->inp; |
553 | 8 | return MAKE_TOKEN(ERRORTOKEN); |
554 | 8 | } |
555 | 14.9k | if (altcol != tok->altindstack[tok->indent]) { |
556 | 1 | return MAKE_TOKEN(_PyTokenizer_indenterror(tok)); |
557 | 1 | } |
558 | 14.9k | } |
559 | 141k | } |
560 | 225k | } |
561 | | |
562 | 1.81M | tok->start = tok->cur; |
563 | 1.81M | tok->starting_col_offset = tok->col_offset; |
564 | | |
565 | | /* Return pending indents/dedents */ |
566 | 1.81M | if (tok->pendin != 0) { |
567 | 37.5k | if (tok->pendin < 0) { |
568 | 18.3k | if (tok->tok_extra_tokens) { |
569 | 0 | p_start = tok->cur; |
570 | 0 | p_end = tok->cur; |
571 | 0 | } |
572 | 18.3k | tok->pendin++; |
573 | 18.3k | return MAKE_TOKEN(DEDENT); |
574 | 18.3k | } |
575 | 19.1k | else { |
576 | 19.1k | if (tok->tok_extra_tokens) { |
577 | 0 | p_start = tok->buf; |
578 | 0 | p_end = tok->cur; |
579 | 0 | } |
580 | 19.1k | tok->pendin--; |
581 | 19.1k | return MAKE_TOKEN(INDENT); |
582 | 19.1k | } |
583 | 37.5k | } |
584 | | |
585 | | /* Peek ahead at the next character */ |
586 | 1.77M | c = tok_nextc(tok); |
587 | 1.77M | tok_backup(tok, c); |
588 | | |
589 | 1.77M | again: |
590 | 1.77M | tok->start = NULL; |
591 | | /* Skip spaces */ |
592 | 2.11M | do { |
593 | 2.11M | c = tok_nextc(tok); |
594 | 2.11M | } while (c == ' ' || c == '\t' || c == '\014'); |
595 | | |
596 | | /* Set start of current token */ |
597 | 1.77M | tok->start = tok->cur == NULL ? NULL : tok->cur - 1; |
598 | 1.77M | tok->starting_col_offset = tok->col_offset - 1; |
599 | | |
600 | | /* Skip comment, unless it's a type comment */ |
601 | 1.77M | if (c == '#') { |
602 | | |
603 | 42.0k | const char* p = NULL; |
604 | 42.0k | const char *prefix, *type_start; |
605 | 42.0k | int current_starting_col_offset; |
606 | | |
607 | 1.26M | while (c != EOF && c != '\n' && c != '\r') { |
608 | 1.22M | c = tok_nextc(tok); |
609 | 1.22M | } |
610 | | |
611 | 42.0k | if (tok->tok_extra_tokens) { |
612 | 0 | p = tok->start; |
613 | 0 | } |
614 | | |
615 | 42.0k | if (tok->type_comments) { |
616 | 0 | p = tok->start; |
617 | 0 | current_starting_col_offset = tok->starting_col_offset; |
618 | 0 | prefix = type_comment_prefix; |
619 | 0 | while (*prefix && p < tok->cur) { |
620 | 0 | if (*prefix == ' ') { |
621 | 0 | while (*p == ' ' || *p == '\t') { |
622 | 0 | p++; |
623 | 0 | current_starting_col_offset++; |
624 | 0 | } |
625 | 0 | } else if (*prefix == *p) { |
626 | 0 | p++; |
627 | 0 | current_starting_col_offset++; |
628 | 0 | } else { |
629 | 0 | break; |
630 | 0 | } |
631 | | |
632 | 0 | prefix++; |
633 | 0 | } |
634 | | |
635 | | /* This is a type comment if we matched all of type_comment_prefix. */ |
636 | 0 | if (!*prefix) { |
637 | 0 | int is_type_ignore = 1; |
638 | | // +6 in order to skip the word 'ignore' |
639 | 0 | const char *ignore_end = p + 6; |
640 | 0 | const int ignore_end_col_offset = current_starting_col_offset + 6; |
641 | 0 | tok_backup(tok, c); /* don't eat the newline or EOF */ |
642 | |
643 | 0 | type_start = p; |
644 | | |
645 | | /* A TYPE_IGNORE is "type: ignore" followed by the end of the token |
646 | | * or anything ASCII and non-alphanumeric. */ |
647 | 0 | is_type_ignore = ( |
648 | 0 | tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 |
649 | 0 | && !(tok->cur > ignore_end |
650 | 0 | && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); |
651 | |
652 | 0 | if (is_type_ignore) { |
653 | 0 | p_start = ignore_end; |
654 | 0 | p_end = tok->cur; |
655 | | |
656 | | /* If this type ignore is the only thing on the line, consume the newline also. */ |
657 | 0 | if (blankline) { |
658 | 0 | tok_nextc(tok); |
659 | 0 | tok->atbol = 1; |
660 | 0 | } |
661 | 0 | return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); |
662 | 0 | } else { |
663 | 0 | p_start = type_start; |
664 | 0 | p_end = tok->cur; |
665 | 0 | return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset); |
666 | 0 | } |
667 | 0 | } |
668 | 0 | } |
669 | 42.0k | if (tok->tok_extra_tokens) { |
670 | 0 | tok_backup(tok, c); /* don't eat the newline or EOF */ |
671 | 0 | p_start = p; |
672 | 0 | p_end = tok->cur; |
673 | 0 | tok->comment_newline = blankline; |
674 | 0 | return MAKE_TOKEN(COMMENT); |
675 | 0 | } |
676 | 42.0k | } |
677 | | |
678 | 1.77M | if (tok->done == E_INTERACT_STOP) { |
679 | 0 | return MAKE_TOKEN(ENDMARKER); |
680 | 0 | } |
681 | | |
682 | | /* Check for EOF and errors now */ |
683 | 1.77M | if (c == EOF) { |
684 | 17.6k | if (tok->level) { |
685 | 4.31k | return MAKE_TOKEN(ERRORTOKEN); |
686 | 4.31k | } |
687 | 13.2k | return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN); |
688 | 17.6k | } |
689 | | |
690 | | /* Identifier (most frequent token!) */ |
691 | 1.76M | nonascii = 0; |
692 | 1.76M | if (is_potential_identifier_start(c)) { |
693 | | /* Process the various legal combinations of b"", r"", u"", and f"". */ |
694 | 531k | int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0; |
695 | 652k | while (1) { |
696 | 652k | if (!saw_b && (c == 'b' || c == 'B')) { |
697 | 21.1k | saw_b = 1; |
698 | 21.1k | } |
699 | | /* Since this is a literal kept only for backwards compatibility, we |
700 | | don't want to support it in arbitrary order like byte literals. */ |
701 | 631k | else if (!saw_u && (c == 'u'|| c == 'U')) { |
702 | 6.45k | saw_u = 1; |
703 | 6.45k | } |
704 | | /* ur"" and ru"" are not supported */ |
705 | 625k | else if (!saw_r && (c == 'r' || c == 'R')) { |
706 | 37.2k | saw_r = 1; |
707 | 37.2k | } |
708 | 587k | else if (!saw_f && (c == 'f' || c == 'F')) { |
709 | 47.0k | saw_f = 1; |
710 | 47.0k | } |
711 | 540k | else if (!saw_t && (c == 't' || c == 'T')) { |
712 | 31.6k | saw_t = 1; |
713 | 31.6k | } |
714 | 509k | else { |
715 | 509k | break; |
716 | 509k | } |
717 | 143k | c = tok_nextc(tok); |
718 | 143k | if (c == '"' || c == '\'') { |
719 | | // Raise error on incompatible string prefixes: |
720 | 22.0k | int status = maybe_raise_syntax_error_for_string_prefixes( |
721 | 22.0k | tok, saw_b, saw_r, saw_u, saw_f, saw_t); |
722 | 22.0k | if (status < 0) { |
723 | 8 | return MAKE_TOKEN(ERRORTOKEN); |
724 | 8 | } |
725 | | |
726 | | // Handle valid f or t string creation: |
727 | 22.0k | if (saw_f || saw_t) { |
728 | 16.8k | goto f_string_quote; |
729 | 16.8k | } |
730 | 5.20k | goto letter_quote; |
731 | 22.0k | } |
732 | 143k | } |
733 | 2.25M | while (is_potential_identifier_char(c)) { |
734 | 1.74M | if (c >= 128) { |
735 | 198k | nonascii = 1; |
736 | 198k | } |
737 | 1.74M | c = tok_nextc(tok); |
738 | 1.74M | } |
739 | 509k | tok_backup(tok, c); |
740 | 509k | if (nonascii && !verify_identifier(tok)) { |
741 | 1.64k | return MAKE_TOKEN(ERRORTOKEN); |
742 | 1.64k | } |
743 | | |
744 | 507k | p_start = tok->start; |
745 | 507k | p_end = tok->cur; |
746 | | |
747 | 507k | return MAKE_TOKEN(NAME); |
748 | 509k | } |
749 | | |
750 | 1.23M | if (c == '\r') { |
751 | 436 | c = tok_nextc(tok); |
752 | 436 | } |
753 | | |
754 | | /* Newline */ |
755 | 1.23M | if (c == '\n') { |
756 | 203k | tok->atbol = 1; |
757 | 203k | if (blankline || tok->level > 0) { |
758 | 83.8k | if (tok->tok_extra_tokens) { |
759 | 0 | if (tok->comment_newline) { |
760 | 0 | tok->comment_newline = 0; |
761 | 0 | } |
762 | 0 | p_start = tok->start; |
763 | 0 | p_end = tok->cur; |
764 | 0 | return MAKE_TOKEN(NL); |
765 | 0 | } |
766 | 83.8k | goto nextline; |
767 | 83.8k | } |
768 | 120k | if (tok->comment_newline && tok->tok_extra_tokens) { |
769 | 0 | tok->comment_newline = 0; |
770 | 0 | p_start = tok->start; |
771 | 0 | p_end = tok->cur; |
772 | 0 | return MAKE_TOKEN(NL); |
773 | 0 | } |
774 | 120k | p_start = tok->start; |
775 | 120k | p_end = tok->cur - 1; /* Leave '\n' out of the string */ |
776 | 120k | tok->cont_line = 0; |
777 | 120k | return MAKE_TOKEN(NEWLINE); |
778 | 120k | } |
779 | | |
780 | | /* Period or number starting with period? */ |
781 | 1.02M | if (c == '.') { |
782 | 31.8k | c = tok_nextc(tok); |
783 | 31.8k | if (Py_ISDIGIT(c)) { |
784 | 3.27k | goto fraction; |
785 | 28.5k | } else if (c == '.') { |
786 | 3.49k | c = tok_nextc(tok); |
787 | 3.49k | if (c == '.') { |
788 | 2.86k | p_start = tok->start; |
789 | 2.86k | p_end = tok->cur; |
790 | 2.86k | return MAKE_TOKEN(ELLIPSIS); |
791 | 2.86k | } |
792 | 634 | else { |
793 | 634 | tok_backup(tok, c); |
794 | 634 | } |
795 | 634 | tok_backup(tok, '.'); |
796 | 634 | } |
797 | 25.0k | else { |
798 | 25.0k | tok_backup(tok, c); |
799 | 25.0k | } |
800 | 25.7k | p_start = tok->start; |
801 | 25.7k | p_end = tok->cur; |
802 | 25.7k | return MAKE_TOKEN(DOT); |
803 | 31.8k | } |
804 | | |
805 | | /* Number */ |
806 | 995k | if (Py_ISDIGIT(c)) { |
807 | 95.4k | if (c == '0') { |
808 | | /* Hex, octal or binary -- maybe. */ |
809 | 32.6k | c = tok_nextc(tok); |
810 | 32.6k | if (c == 'x' || c == 'X') { |
811 | | /* Hex */ |
812 | 15.9k | c = tok_nextc(tok); |
813 | 16.1k | do { |
814 | 16.1k | if (c == '_') { |
815 | 231 | c = tok_nextc(tok); |
816 | 231 | } |
817 | 16.1k | if (!Py_ISXDIGIT(c)) { |
818 | 19 | tok_backup(tok, c); |
819 | 19 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid hexadecimal literal")); |
820 | 19 | } |
821 | 79.3k | do { |
822 | 79.3k | c = tok_nextc(tok); |
823 | 79.3k | } while (Py_ISXDIGIT(c)); |
824 | 16.1k | } while (c == '_'); |
825 | 15.9k | if (!verify_end_of_number(tok, c, "hexadecimal")) { |
826 | 3 | return MAKE_TOKEN(ERRORTOKEN); |
827 | 3 | } |
828 | 15.9k | } |
829 | 16.6k | else if (c == 'o' || c == 'O') { |
830 | | /* Octal */ |
831 | 742 | c = tok_nextc(tok); |
832 | 1.43k | do { |
833 | 1.43k | if (c == '_') { |
834 | 695 | c = tok_nextc(tok); |
835 | 695 | } |
836 | 1.43k | if (c < '0' || c >= '8') { |
837 | 27 | if (Py_ISDIGIT(c)) { |
838 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
839 | 1 | "invalid digit '%c' in octal literal", c)); |
840 | 1 | } |
841 | 26 | else { |
842 | 26 | tok_backup(tok, c); |
843 | 26 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid octal literal")); |
844 | 26 | } |
845 | 27 | } |
846 | 4.61k | do { |
847 | 4.61k | c = tok_nextc(tok); |
848 | 4.61k | } while ('0' <= c && c < '8'); |
849 | 1.40k | } while (c == '_'); |
850 | 715 | if (Py_ISDIGIT(c)) { |
851 | 2 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
852 | 2 | "invalid digit '%c' in octal literal", c)); |
853 | 2 | } |
854 | 713 | if (!verify_end_of_number(tok, c, "octal")) { |
855 | 2 | return MAKE_TOKEN(ERRORTOKEN); |
856 | 2 | } |
857 | 713 | } |
858 | 15.9k | else if (c == 'b' || c == 'B') { |
859 | | /* Binary */ |
860 | 591 | c = tok_nextc(tok); |
861 | 903 | do { |
862 | 903 | if (c == '_') { |
863 | 320 | c = tok_nextc(tok); |
864 | 320 | } |
865 | 903 | if (c != '0' && c != '1') { |
866 | 29 | if (Py_ISDIGIT(c)) { |
867 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c)); |
868 | 1 | } |
869 | 28 | else { |
870 | 28 | tok_backup(tok, c); |
871 | 28 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid binary literal")); |
872 | 28 | } |
873 | 29 | } |
874 | 4.41k | do { |
875 | 4.41k | c = tok_nextc(tok); |
876 | 4.41k | } while (c == '0' || c == '1'); |
877 | 874 | } while (c == '_'); |
878 | 562 | if (Py_ISDIGIT(c)) { |
879 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c)); |
880 | 1 | } |
881 | 561 | if (!verify_end_of_number(tok, c, "binary")) { |
882 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
883 | 1 | } |
884 | 561 | } |
885 | 15.3k | else { |
886 | 15.3k | int nonzero = 0; |
887 | | /* maybe old-style octal; c is first char of it */ |
888 | | /* in any case, allow '0' as a literal */ |
889 | 17.0k | while (1) { |
890 | 17.0k | if (c == '_') { |
891 | 209 | c = tok_nextc(tok); |
892 | 209 | if (!Py_ISDIGIT(c)) { |
893 | 3 | tok_backup(tok, c); |
894 | 3 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal")); |
895 | 3 | } |
896 | 209 | } |
897 | 17.0k | if (c != '0') { |
898 | 15.3k | break; |
899 | 15.3k | } |
900 | 1.71k | c = tok_nextc(tok); |
901 | 1.71k | } |
902 | 15.3k | char* zeros_end = tok->cur; |
903 | 15.3k | if (Py_ISDIGIT(c)) { |
904 | 499 | nonzero = 1; |
905 | 499 | c = tok_decimal_tail(tok); |
906 | 499 | if (c == 0) { |
907 | 3 | return MAKE_TOKEN(ERRORTOKEN); |
908 | 3 | } |
909 | 499 | } |
910 | 15.3k | if (c == '.') { |
911 | 792 | c = tok_nextc(tok); |
912 | 792 | goto fraction; |
913 | 792 | } |
914 | 14.5k | else if (c == 'e' || c == 'E') { |
915 | 963 | goto exponent; |
916 | 963 | } |
917 | 13.5k | else if (c == 'j' || c == 'J') { |
918 | 810 | goto imaginary; |
919 | 810 | } |
920 | 12.7k | else if (nonzero && !tok->tok_extra_tokens) { |
921 | | /* Old-style octal: now disallowed. */ |
922 | 24 | tok_backup(tok, c); |
923 | 24 | return MAKE_TOKEN(_PyTokenizer_syntaxerror_known_range( |
924 | 24 | tok, (int)(tok->start + 1 - tok->line_start), |
925 | 24 | (int)(zeros_end - tok->line_start), |
926 | 24 | "leading zeros in decimal integer " |
927 | 24 | "literals are not permitted; " |
928 | 24 | "use an 0o prefix for octal integers")); |
929 | 24 | } |
930 | 12.7k | if (!verify_end_of_number(tok, c, "decimal")) { |
931 | 33 | return MAKE_TOKEN(ERRORTOKEN); |
932 | 33 | } |
933 | 12.7k | } |
934 | 32.6k | } |
935 | 62.8k | else { |
936 | | /* Decimal */ |
937 | 62.8k | c = tok_decimal_tail(tok); |
938 | 62.8k | if (c == 0) { |
939 | 10 | return MAKE_TOKEN(ERRORTOKEN); |
940 | 10 | } |
941 | 62.8k | { |
942 | | /* Accept floating-point numbers. */ |
943 | 62.8k | if (c == '.') { |
944 | 3.81k | c = tok_nextc(tok); |
945 | 7.87k | fraction: |
946 | | /* Fraction */ |
947 | 7.87k | if (Py_ISDIGIT(c)) { |
948 | 6.18k | c = tok_decimal_tail(tok); |
949 | 6.18k | if (c == 0) { |
950 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
951 | 1 | } |
952 | 6.18k | } |
953 | 7.87k | } |
954 | 66.9k | if (c == 'e' || c == 'E') { |
955 | 10.0k | int e; |
956 | 11.0k | exponent: |
957 | 11.0k | e = c; |
958 | | /* Exponent part */ |
959 | 11.0k | c = tok_nextc(tok); |
960 | 11.0k | if (c == '+' || c == '-') { |
961 | 3.55k | c = tok_nextc(tok); |
962 | 3.55k | if (!Py_ISDIGIT(c)) { |
963 | 9 | tok_backup(tok, c); |
964 | 9 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal")); |
965 | 9 | } |
966 | 7.48k | } else if (!Py_ISDIGIT(c)) { |
967 | 478 | tok_backup(tok, c); |
968 | 478 | if (!verify_end_of_number(tok, e, "decimal")) { |
969 | 50 | return MAKE_TOKEN(ERRORTOKEN); |
970 | 50 | } |
971 | 428 | tok_backup(tok, e); |
972 | 428 | p_start = tok->start; |
973 | 428 | p_end = tok->cur; |
974 | 428 | return MAKE_TOKEN(NUMBER); |
975 | 478 | } |
976 | 10.5k | c = tok_decimal_tail(tok); |
977 | 10.5k | if (c == 0) { |
978 | 1 | return MAKE_TOKEN(ERRORTOKEN); |
979 | 1 | } |
980 | 10.5k | } |
981 | 67.3k | if (c == 'j' || c == 'J') { |
982 | | /* Imaginary part */ |
983 | 4.48k | imaginary: |
984 | 4.48k | c = tok_nextc(tok); |
985 | 4.48k | if (!verify_end_of_number(tok, c, "imaginary")) { |
986 | 10 | return MAKE_TOKEN(ERRORTOKEN); |
987 | 10 | } |
988 | 4.48k | } |
989 | 63.6k | else if (!verify_end_of_number(tok, c, "decimal")) { |
990 | 114 | return MAKE_TOKEN(ERRORTOKEN); |
991 | 114 | } |
992 | 67.3k | } |
993 | 67.3k | } |
994 | 97.9k | tok_backup(tok, c); |
995 | 97.9k | p_start = tok->start; |
996 | 97.9k | p_end = tok->cur; |
997 | 97.9k | return MAKE_TOKEN(NUMBER); |
998 | 95.4k | } |
999 | | |
1000 | 916k | f_string_quote: |
1001 | 916k | if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't') |
1002 | 916k | && (c == '\'' || c == '"'))) { |
1003 | | |
1004 | 16.8k | int quote = c; |
1005 | 16.8k | int quote_size = 1; /* 1 or 3 */ |
1006 | | |
1007 | | /* Nodes of type STRING, especially multi-line strings |
1008 | | must be handled differently in order to get both |
1009 | | the starting line number and the column offset right. |
1010 | | (cf. issue 16806) */ |
1011 | 16.8k | tok->first_lineno = tok->lineno; |
1012 | 16.8k | tok->multi_line_start = tok->line_start; |
1013 | | |
1014 | | /* Find the quote size and start of string */ |
1015 | 16.8k | int after_quote = tok_nextc(tok); |
1016 | 16.8k | if (after_quote == quote) { |
1017 | 2.34k | int after_after_quote = tok_nextc(tok); |
1018 | 2.34k | if (after_after_quote == quote) { |
1019 | 791 | quote_size = 3; |
1020 | 791 | } |
1021 | 1.55k | else { |
1022 | | // TODO: Check this |
1023 | 1.55k | tok_backup(tok, after_after_quote); |
1024 | 1.55k | tok_backup(tok, after_quote); |
1025 | 1.55k | } |
1026 | 2.34k | } |
1027 | 16.8k | if (after_quote != quote) { |
1028 | 14.4k | tok_backup(tok, after_quote); |
1029 | 14.4k | } |
1030 | | |
1031 | | |
1032 | 16.8k | p_start = tok->start; |
1033 | 16.8k | p_end = tok->cur; |
1034 | 16.8k | if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) { |
1035 | 2 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested f-strings or t-strings")); |
1036 | 2 | } |
1037 | 16.8k | tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok); |
1038 | 16.8k | the_current_tok->kind = TOK_FSTRING_MODE; |
1039 | 16.8k | the_current_tok->quote = quote; |
1040 | 16.8k | the_current_tok->quote_size = quote_size; |
1041 | 16.8k | the_current_tok->start = tok->start; |
1042 | 16.8k | the_current_tok->multi_line_start = tok->line_start; |
1043 | 16.8k | the_current_tok->first_line = tok->lineno; |
1044 | 16.8k | the_current_tok->start_offset = -1; |
1045 | 16.8k | the_current_tok->multi_line_start_offset = -1; |
1046 | 16.8k | the_current_tok->last_expr_buffer = NULL; |
1047 | 16.8k | the_current_tok->last_expr_size = 0; |
1048 | 16.8k | the_current_tok->last_expr_end = -1; |
1049 | 16.8k | the_current_tok->in_format_spec = 0; |
1050 | 16.8k | the_current_tok->in_debug = 0; |
1051 | | |
1052 | 16.8k | enum string_kind_t string_kind = FSTRING; |
1053 | 16.8k | switch (*tok->start) { |
1054 | 784 | case 'T': |
1055 | 3.41k | case 't': |
1056 | 3.41k | the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r'; |
1057 | 3.41k | string_kind = TSTRING; |
1058 | 3.41k | break; |
1059 | 2.08k | case 'F': |
1060 | 12.7k | case 'f': |
1061 | 12.7k | the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r'; |
1062 | 12.7k | break; |
1063 | 284 | case 'R': |
1064 | 682 | case 'r': |
1065 | 682 | the_current_tok->raw = 1; |
1066 | 682 | if (Py_TOLOWER(*(tok->start + 1)) == 't') { |
1067 | 209 | string_kind = TSTRING; |
1068 | 209 | } |
1069 | 682 | break; |
1070 | 0 | default: |
1071 | 0 | Py_UNREACHABLE(); |
1072 | 16.8k | } |
1073 | | |
1074 | 16.8k | the_current_tok->string_kind = string_kind; |
1075 | 16.8k | the_current_tok->curly_bracket_depth = 0; |
1076 | 16.8k | the_current_tok->curly_bracket_expr_start_depth = -1; |
1077 | 16.8k | return string_kind == TSTRING ? MAKE_TOKEN(TSTRING_START) : MAKE_TOKEN(FSTRING_START); |
1078 | 16.8k | } |
1079 | | |
1080 | 904k | letter_quote: |
1081 | | /* String */ |
1082 | 904k | if (c == '\'' || c == '"') { |
1083 | 56.5k | int quote = c; |
1084 | 56.5k | int quote_size = 1; /* 1 or 3 */ |
1085 | 56.5k | int end_quote_size = 0; |
1086 | 56.5k | int has_escaped_quote = 0; |
1087 | | |
1088 | | /* Nodes of type STRING, especially multi-line strings |
1089 | | must be handled differently in order to get both |
1090 | | the starting line number and the column offset right. |
1091 | | (cf. issue 16806) */ |
1092 | 56.5k | tok->first_lineno = tok->lineno; |
1093 | 56.5k | tok->multi_line_start = tok->line_start; |
1094 | | |
1095 | | /* Find the quote size and start of string */ |
1096 | 56.5k | c = tok_nextc(tok); |
1097 | 56.5k | if (c == quote) { |
1098 | 10.0k | c = tok_nextc(tok); |
1099 | 10.0k | if (c == quote) { |
1100 | 2.43k | quote_size = 3; |
1101 | 2.43k | } |
1102 | 7.60k | else { |
1103 | 7.60k | end_quote_size = 1; /* empty string found */ |
1104 | 7.60k | } |
1105 | 10.0k | } |
1106 | 56.5k | if (c != quote) { |
1107 | 54.1k | tok_backup(tok, c); |
1108 | 54.1k | } |
1109 | | |
1110 | | /* Get rest of string */ |
1111 | 1.13M | while (end_quote_size != quote_size) { |
1112 | 1.07M | c = tok_nextc(tok); |
1113 | 1.07M | if (tok->done == E_ERROR) { |
1114 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1115 | 0 | } |
1116 | 1.07M | if (tok->done == E_DECODE) { |
1117 | 0 | break; |
1118 | 0 | } |
1119 | 1.07M | if (c == EOF || (quote_size == 1 && c == '\n')) { |
1120 | 440 | assert(tok->multi_line_start != NULL); |
1121 | | // shift the tok_state's location into |
1122 | | // the start of the string, and report the error |
1123 | | // from the initial quote character |
1124 | 440 | tok->cur = (char *)tok->start; |
1125 | 440 | tok->cur++; |
1126 | 440 | tok->line_start = tok->multi_line_start; |
1127 | 440 | int start = tok->lineno; |
1128 | 440 | tok->lineno = tok->first_lineno; |
1129 | | |
1130 | 440 | if (INSIDE_FSTRING(tok)) { |
1131 | | /* When we are in an f-string, before raising the |
1132 | | * unterminated string literal error, check whether |
1133 | | * the initial quote matches the f-string's quotes; |
1134 | | * if it does, this must be a missing '}' token, |
1135 | | * so raise the proper error instead. */ |
1136 | 24 | tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); |
1137 | 24 | if (the_current_tok->quote == quote && |
1138 | 24 | the_current_tok->quote_size == quote_size) { |
1139 | 11 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1140 | 11 | "%c-string: expecting '}'", TOK_GET_STRING_PREFIX(tok))); |
1141 | 11 | } |
1142 | 24 | } |
1143 | | |
1144 | 429 | if (quote_size == 3) { |
1145 | 15 | _PyTokenizer_syntaxerror(tok, "unterminated triple-quoted string literal" |
1146 | 15 | " (detected at line %d)", start); |
1147 | 15 | if (c != '\n') { |
1148 | 15 | tok->done = E_EOFS; |
1149 | 15 | } |
1150 | 15 | return MAKE_TOKEN(ERRORTOKEN); |
1151 | 15 | } |
1152 | 414 | else { |
1153 | 414 | if (has_escaped_quote) { |
1154 | 12 | _PyTokenizer_syntaxerror( |
1155 | 12 | tok, |
1156 | 12 | "unterminated string literal (detected at line %d); " |
1157 | 12 | "perhaps you escaped the end quote?", |
1158 | 12 | start |
1159 | 12 | ); |
1160 | 402 | } else { |
1161 | 402 | _PyTokenizer_syntaxerror( |
1162 | 402 | tok, "unterminated string literal (detected at line %d)", start |
1163 | 402 | ); |
1164 | 402 | } |
1165 | 414 | if (c != '\n') { |
1166 | 15 | tok->done = E_EOLS; |
1167 | 15 | } |
1168 | 414 | return MAKE_TOKEN(ERRORTOKEN); |
1169 | 414 | } |
1170 | 429 | } |
1171 | 1.07M | if (c == quote) { |
1172 | 54.8k | end_quote_size += 1; |
1173 | 54.8k | } |
1174 | 1.02M | else { |
1175 | 1.02M | end_quote_size = 0; |
1176 | 1.02M | if (c == '\\') { |
1177 | 32.6k | c = tok_nextc(tok); /* skip escaped char */ |
1178 | 32.6k | if (c == quote) { /* but record whether the escaped char was a quote */ |
1179 | 1.50k | has_escaped_quote = 1; |
1180 | 1.50k | } |
1181 | 32.6k | if (c == '\r') { |
1182 | 205 | c = tok_nextc(tok); |
1183 | 205 | } |
1184 | 32.6k | } |
1185 | 1.02M | } |
1186 | 1.07M | } |
1187 | | |
1188 | 56.1k | p_start = tok->start; |
1189 | 56.1k | p_end = tok->cur; |
1190 | 56.1k | return MAKE_TOKEN(STRING); |
1191 | 56.5k | } |
1192 | | |
1193 | | /* Line continuation */ |
1194 | 848k | if (c == '\\') { |
1195 | 427 | if ((c = tok_continuation_line(tok)) == -1) { |
1196 | 72 | return MAKE_TOKEN(ERRORTOKEN); |
1197 | 72 | } |
1198 | 355 | tok->cont_line = 1; |
1199 | 355 | goto again; /* Read next line */ |
1200 | 427 | } |
1201 | | |
1202 | | /* Punctuation character */ |
1203 | 847k | int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{'); |
1204 | 847k | if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) { |
1205 | | /* This code block gets executed before the curly_bracket_depth is incremented |
1206 | | * by the `{` case, so to ensure that we are on the 0th level, we need |
1207 | | * to adjust it manually */ |
1208 | 53.6k | int cursor = current_tok->curly_bracket_depth - (c != '{'); |
1209 | 53.6k | int in_format_spec = current_tok->in_format_spec; |
1210 | 53.6k | int cursor_in_format_with_debug = |
1211 | 53.6k | cursor == 1 && (current_tok->in_debug || in_format_spec); |
1212 | 53.6k | int cursor_valid = cursor == 0 || cursor_in_format_with_debug; |
1213 | 53.6k | if ((cursor_valid) && !_PyLexer_update_ftstring_expr(tok, c)) { |
1214 | 0 | return MAKE_TOKEN(ENDMARKER); |
1215 | 0 | } |
1216 | 53.6k | if ((cursor_valid) && c != '{' && set_ftstring_expr(tok, token, c)) { |
1217 | 5 | return MAKE_TOKEN(ERRORTOKEN); |
1218 | 5 | } |
1219 | | |
1220 | 53.6k | if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) { |
1221 | 3.80k | current_tok->kind = TOK_FSTRING_MODE; |
1222 | 3.80k | current_tok->in_format_spec = 1; |
1223 | 3.80k | p_start = tok->start; |
1224 | 3.80k | p_end = tok->cur; |
1225 | 3.80k | return MAKE_TOKEN(_PyToken_OneChar(c)); |
1226 | 3.80k | } |
1227 | 53.6k | } |
1228 | | |
1229 | | /* Check for two-character token */ |
1230 | 844k | { |
1231 | 844k | int c2 = tok_nextc(tok); |
1232 | 844k | int current_token = _PyToken_TwoChars(c, c2); |
1233 | 844k | if (current_token != OP) { |
1234 | 22.7k | int c3 = tok_nextc(tok); |
1235 | 22.7k | int current_token3 = _PyToken_ThreeChars(c, c2, c3); |
1236 | 22.7k | if (current_token3 != OP) { |
1237 | 1.21k | current_token = current_token3; |
1238 | 1.21k | } |
1239 | 21.5k | else { |
1240 | 21.5k | tok_backup(tok, c3); |
1241 | 21.5k | } |
1242 | 22.7k | p_start = tok->start; |
1243 | 22.7k | p_end = tok->cur; |
1244 | 22.7k | return MAKE_TOKEN(current_token); |
1245 | 22.7k | } |
1246 | 821k | tok_backup(tok, c2); |
1247 | 821k | } |
1248 | | |
1249 | | /* Keep track of parentheses nesting level */ |
1250 | 0 | switch (c) { |
1251 | 90.1k | case '(': |
1252 | 126k | case '[': |
1253 | 171k | case '{': |
1254 | 171k | if (tok->level >= MAXLEVEL) { |
1255 | 3 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested parentheses")); |
1256 | 3 | } |
1257 | 171k | tok->parenstack[tok->level] = c; |
1258 | 171k | tok->parenlinenostack[tok->level] = tok->lineno; |
1259 | 171k | tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start); |
1260 | 171k | tok->level++; |
1261 | 171k | if (INSIDE_FSTRING(tok)) { |
1262 | 30.5k | current_tok->curly_bracket_depth++; |
1263 | 30.5k | } |
1264 | 171k | break; |
1265 | 58.0k | case ')': |
1266 | 70.0k | case ']': |
1267 | 96.9k | case '}': |
1268 | 96.9k | if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') { |
1269 | 48 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1270 | 48 | "%c-string: single '}' is not allowed", TOK_GET_STRING_PREFIX(tok))); |
1271 | 48 | } |
1272 | 96.9k | if (!tok->tok_extra_tokens && !tok->level) { |
1273 | 215 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "unmatched '%c'", c)); |
1274 | 215 | } |
1275 | 96.7k | if (tok->level > 0) { |
1276 | 96.7k | tok->level--; |
1277 | 96.7k | int opening = tok->parenstack[tok->level]; |
1278 | 96.7k | if (!tok->tok_extra_tokens && !((opening == '(' && c == ')') || |
1279 | 96.7k | (opening == '[' && c == ']') || |
1280 | 96.7k | (opening == '{' && c == '}'))) { |
1281 | | /* If the opening bracket belongs to an f-string's expression |
1282 | | part (e.g. f"{)}") and the closing bracket is an arbitrary |
1283 | | nested expression, then instead of matching a different |
1284 | | syntactical construct with it; we'll throw an unmatched |
1285 | | parentheses error. */ |
1286 | 46 | if (INSIDE_FSTRING(tok) && opening == '{') { |
1287 | 7 | assert(current_tok->curly_bracket_depth >= 0); |
1288 | 7 | int previous_bracket = current_tok->curly_bracket_depth - 1; |
1289 | 7 | if (previous_bracket == current_tok->curly_bracket_expr_start_depth) { |
1290 | 5 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1291 | 5 | "%c-string: unmatched '%c'", TOK_GET_STRING_PREFIX(tok), c)); |
1292 | 5 | } |
1293 | 7 | } |
1294 | 41 | if (tok->parenlinenostack[tok->level] != tok->lineno) { |
1295 | 5 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1296 | 5 | "closing parenthesis '%c' does not match " |
1297 | 5 | "opening parenthesis '%c' on line %d", |
1298 | 5 | c, opening, tok->parenlinenostack[tok->level])); |
1299 | 5 | } |
1300 | 36 | else { |
1301 | 36 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1302 | 36 | "closing parenthesis '%c' does not match " |
1303 | 36 | "opening parenthesis '%c'", |
1304 | 36 | c, opening)); |
1305 | 36 | } |
1306 | 41 | } |
1307 | 96.7k | } |
1308 | | |
1309 | 96.6k | if (INSIDE_FSTRING(tok)) { |
1310 | 22.8k | current_tok->curly_bracket_depth--; |
1311 | 22.8k | if (current_tok->curly_bracket_depth < 0) { |
1312 | 1 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "%c-string: unmatched '%c'", |
1313 | 1 | TOK_GET_STRING_PREFIX(tok), c)); |
1314 | 1 | } |
1315 | 22.8k | if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) { |
1316 | 20.7k | current_tok->curly_bracket_expr_start_depth--; |
1317 | 20.7k | current_tok->kind = TOK_FSTRING_MODE; |
1318 | 20.7k | current_tok->in_format_spec = 0; |
1319 | 20.7k | current_tok->in_debug = 0; |
1320 | 20.7k | } |
1321 | 22.8k | } |
1322 | 96.6k | break; |
1323 | 552k | default: |
1324 | 552k | break; |
1325 | 821k | } |
1326 | | |
1327 | 821k | if (!Py_UNICODE_ISPRINTABLE(c)) { |
1328 | 480 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", c)); |
1329 | 480 | } |
1330 | | |
1331 | 820k | if (c == '=' && INSIDE_FSTRING_EXPR(current_tok)) { |
1332 | 41.8k | current_tok->in_debug = 1; |
1333 | 41.8k | } |
1334 | | |
1335 | | /* Punctuation character */ |
1336 | 820k | p_start = tok->start; |
1337 | 820k | p_end = tok->cur; |
1338 | 820k | return MAKE_TOKEN(_PyToken_OneChar(c)); |
1339 | 821k | } |
1340 | | |
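The parenstack/parenlinenostack bookkeeping in the switch above is a plain matching stack. Its core invariant as a standalone sketch (hypothetical helper; the cap of 200 is assumed to mirror MAXLEVEL from the tokenizer headers):

/* Return 1 iff every closer pairs with the most recent opener. */
static int
brackets_balanced(const char *s)
{
    char stack[200];
    int level = 0;
    for (; *s; s++) {
        if (*s == '(' || *s == '[' || *s == '{') {
            if (level >= 200) {
                return 0;             /* "too many nested parentheses" */
            }
            stack[level++] = *s;
        }
        else if (*s == ')' || *s == ']' || *s == '}') {
            if (level == 0) {
                return 0;             /* unmatched closer */
            }
            char open = stack[--level];
            if (!((open == '(' && *s == ')') ||
                  (open == '[' && *s == ']') ||
                  (open == '{' && *s == '}'))) {
                return 0;             /* mismatched pair */
            }
        }
    }
    return level == 0;
}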
1341 | | static int |
1342 | | tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) |
1343 | 54.0k | { |
1344 | 54.0k | const char *p_start = NULL; |
1345 | 54.0k | const char *p_end = NULL; |
1346 | 54.0k | int end_quote_size = 0; |
1347 | 54.0k | int unicode_escape = 0; |
1348 | | |
1349 | 54.0k | tok->start = tok->cur; |
1350 | 54.0k | tok->first_lineno = tok->lineno; |
1351 | 54.0k | tok->starting_col_offset = tok->col_offset; |
1352 | | |
1353 | | // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize |
1354 | | // before it. |
1355 | 54.0k | int start_char = tok_nextc(tok); |
1356 | 54.0k | if (start_char == '{') { |
1357 | 13.2k | int peek1 = tok_nextc(tok); |
1358 | 13.2k | tok_backup(tok, peek1); |
1359 | 13.2k | tok_backup(tok, start_char); |
1360 | 13.2k | if (peek1 != '{') { |
1361 | 9.67k | current_tok->curly_bracket_expr_start_depth++; |
1362 | 9.67k | if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { |
1363 | 4 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1364 | 4 | "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok))); |
1365 | 4 | } |
1366 | 9.66k | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1367 | 9.66k | return tok_get_normal_mode(tok, current_tok, token); |
1368 | 9.67k | } |
1369 | 13.2k | } |
1370 | 40.8k | else { |
1371 | 40.8k | tok_backup(tok, start_char); |
1372 | 40.8k | } |
1373 | | |
1374 | | // Check if we are at the end of the string |
1375 | 63.2k | for (int i = 0; i < current_tok->quote_size; i++) { |
1376 | 51.0k | int quote = tok_nextc(tok); |
1377 | 51.0k | if (quote != current_tok->quote) { |
1378 | 32.1k | tok_backup(tok, quote); |
1379 | 32.1k | goto f_string_middle; |
1380 | 32.1k | } |
1381 | 51.0k | } |
1382 | | |
1383 | 12.2k | if (current_tok->last_expr_buffer != NULL) { |
1384 | 5.99k | PyMem_Free(current_tok->last_expr_buffer); |
1385 | 5.99k | current_tok->last_expr_buffer = NULL; |
1386 | 5.99k | current_tok->last_expr_size = 0; |
1387 | 5.99k | current_tok->last_expr_end = -1; |
1388 | 5.99k | } |
1389 | | |
1390 | 12.2k | p_start = tok->start; |
1391 | 12.2k | p_end = tok->cur; |
1392 | 12.2k | tok->tok_mode_stack_index--; |
1393 | 12.2k | return MAKE_TOKEN(FTSTRING_END(current_tok)); |
1394 | | |
1395 | 32.1k | f_string_middle: |
1396 | | |
1397 | | // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle |
1398 | | // this. |
1399 | 32.1k | tok->multi_line_start = tok->line_start; |
1400 | 160k | while (end_quote_size != current_tok->quote_size) { |
1401 | 154k | int c = tok_nextc(tok); |
1402 | 154k | if (tok->done == E_ERROR || tok->done == E_DECODE) { |
1403 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1404 | 0 | } |
1405 | 154k | int in_format_spec = ( |
1406 | 154k | current_tok->in_format_spec |
1407 | 154k | && |
1408 | 154k | INSIDE_FSTRING_EXPR(current_tok) |
1409 | 154k | ); |
1410 | | |
1411 | 154k | if (c == EOF || (current_tok->quote_size == 1 && c == '\n')) { |
1412 | 464 | if (tok->decoding_erred) { |
1413 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1414 | 0 | } |
1415 | | |
1416 | | // If we are in a format spec and we found a newline, |
1417 | | // it means that the format spec ends here and we should |
1418 | | // return to the regular mode. |
1419 | 464 | if (in_format_spec && c == '\n') { |
1420 | 75 | if (current_tok->quote_size == 1) { |
1421 | 75 | return MAKE_TOKEN( |
1422 | 75 | _PyTokenizer_syntaxerror( |
1423 | 75 | tok, |
1424 | 75 | "%c-string: newlines are not allowed in format specifiers for single quoted %c-strings", |
1425 | 75 | TOK_GET_STRING_PREFIX(tok), TOK_GET_STRING_PREFIX(tok) |
1426 | 75 | ) |
1427 | 75 | ); |
1428 | 75 | } |
1429 | 0 | tok_backup(tok, c); |
1430 | 0 | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1431 | 0 | current_tok->in_format_spec = 0; |
1432 | 0 | p_start = tok->start; |
1433 | 0 | p_end = tok->cur; |
1434 | 0 | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1435 | 75 | } |
1436 | | |
1437 | 389 | assert(tok->multi_line_start != NULL); |
1438 | | // shift the tok_state's location into |
1439 | | // the start of the string, and report the error |
1440 | | // from the initial quote character |
1441 | 389 | tok->cur = (char *)current_tok->start; |
1442 | 389 | tok->cur++; |
1443 | 389 | tok->line_start = current_tok->multi_line_start; |
1444 | 389 | int start = tok->lineno; |
1445 | | |
1446 | 389 | tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); |
1447 | 389 | tok->lineno = the_current_tok->first_line; |
1448 | | |
1449 | 389 | if (current_tok->quote_size == 3) { |
1450 | 32 | _PyTokenizer_syntaxerror(tok, |
1451 | 32 | "unterminated triple-quoted %c-string literal" |
1452 | 32 | " (detected at line %d)", |
1453 | 32 | TOK_GET_STRING_PREFIX(tok), start); |
1454 | 32 | if (c != '\n') { |
1455 | 32 | tok->done = E_EOFS; |
1456 | 32 | } |
1457 | 32 | return MAKE_TOKEN(ERRORTOKEN); |
1458 | 32 | } |
1459 | 357 | else { |
1460 | 357 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1461 | 357 | "unterminated %c-string literal (detected at" |
1462 | 357 | " line %d)", TOK_GET_STRING_PREFIX(tok), start)); |
1463 | 357 | } |
1464 | 389 | } |
1465 | | |
1466 | 154k | if (c == current_tok->quote) { |
1467 | 8.64k | end_quote_size += 1; |
1468 | 8.64k | continue; |
1469 | 145k | } else { |
1470 | 145k | end_quote_size = 0; |
1471 | 145k | } |
1472 | | |
1473 | 145k | if (c == '{') { |
1474 | 20.3k | if (!_PyLexer_update_ftstring_expr(tok, c)) { |
1475 | 0 | return MAKE_TOKEN(ENDMARKER); |
1476 | 0 | } |
1477 | 20.3k | int peek = tok_nextc(tok); |
1478 | 20.3k | if (peek != '{' || in_format_spec) { |
1479 | 16.3k | tok_backup(tok, peek); |
1480 | 16.3k | tok_backup(tok, c); |
1481 | 16.3k | current_tok->curly_bracket_expr_start_depth++; |
1482 | 16.3k | if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { |
1483 | 5 | return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, |
1484 | 5 | "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok))); |
1485 | 5 | } |
1486 | 16.3k | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1487 | 16.3k | current_tok->in_format_spec = 0; |
1488 | 16.3k | p_start = tok->start; |
1489 | 16.3k | p_end = tok->cur; |
1490 | 16.3k | } else { |
1491 | 4.01k | p_start = tok->start; |
1492 | 4.01k | p_end = tok->cur - 1; |
1493 | 4.01k | } |
1494 | 20.3k | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1495 | 125k | } else if (c == '}') { |
1496 | 5.02k | if (unicode_escape) { |
1497 | 496 | p_start = tok->start; |
1498 | 496 | p_end = tok->cur; |
1499 | 496 | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1500 | 496 | } |
1501 | 4.52k | int peek = tok_nextc(tok); |
1502 | | |
1503 | | // The tokenizer can only be in the format spec if we have already completed the expression |
1504 | | // scanning (indicated by the end of the expression being set) and we are not at the top level |
1505 | | // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double |
1506 | | // brackets, we can bypass it here. |
1507 | 4.52k | int cursor = current_tok->curly_bracket_depth; |
1508 | 4.52k | if (peek == '}' && !in_format_spec && cursor == 0) { |
1509 | 1.82k | p_start = tok->start; |
1510 | 1.82k | p_end = tok->cur - 1; |
1511 | 2.70k | } else { |
1512 | 2.70k | tok_backup(tok, peek); |
1513 | 2.70k | tok_backup(tok, c); |
1514 | 2.70k | TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; |
1515 | 2.70k | current_tok->in_format_spec = 0; |
1516 | 2.70k | p_start = tok->start; |
1517 | 2.70k | p_end = tok->cur; |
1518 | 2.70k | } |
1519 | 4.52k | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1520 | 120k | } else if (c == '\\') { |
1521 | 6.84k | int peek = tok_nextc(tok); |
1522 | 6.84k | if (peek == '\r') { |
1523 | 69 | peek = tok_nextc(tok); |
1524 | 69 | } |
1525 | | // Special case when the backslash is right before a curly |
1526 | | // brace. We have to restore the state and return control |
1527 | | // back to the loop for the next iteration. |
1528 | 6.84k | if (peek == '{' || peek == '}') { |
1529 | 1.34k | if (!current_tok->raw) { |
1530 | 1.15k | if (_PyTokenizer_warn_invalid_escape_sequence(tok, peek)) { |
1531 | 0 | return MAKE_TOKEN(ERRORTOKEN); |
1532 | 0 | } |
1533 | 1.15k | } |
1534 | 1.34k | tok_backup(tok, peek); |
1535 | 1.34k | continue; |
1536 | 1.34k | } |
1537 | | |
1538 | 5.50k | if (!current_tok->raw) { |
1539 | 5.11k | if (peek == 'N') { |
1540 | | /* Handle named unicode escapes (\N{BULLET}) */ |
1541 | 740 | peek = tok_nextc(tok); |
1542 | 740 | if (peek == '{') { |
1543 | 523 | unicode_escape = 1; |
1544 | 523 | } else { |
1545 | 217 | tok_backup(tok, peek); |
1546 | 217 | } |
1547 | 740 | } |
1548 | 5.11k | } /* else { |
1549 | | skip the escaped character |
1550 | | }*/ |
1551 | 5.50k | } |
1552 | 145k | } |
1553 | | |
1554 | | // Back up over the f-string quotes to emit a final FSTRING_MIDDLE and |
1555 | | // add the quotes to the FSTRING_END in the next tokenizer iteration. |
1556 | 13.5k | for (int i = 0; i < current_tok->quote_size; i++) { |
1557 | 7.22k | tok_backup(tok, current_tok->quote); |
1558 | 7.22k | } |
1559 | 6.35k | p_start = tok->start; |
1560 | 6.35k | p_end = tok->cur; |
1561 | 6.35k | return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok)); |
1562 | 32.1k | } |
1563 | | |
1564 | | static int |
1565 | | tok_get(struct tok_state *tok, struct token *token) |
1566 | 1.77M | { |
1567 | 1.77M | tokenizer_mode *current_tok = TOK_GET_MODE(tok); |
1568 | 1.77M | if (current_tok->kind == TOK_REGULAR_MODE) { |
1569 | 1.72M | return tok_get_normal_mode(tok, current_tok, token); |
1570 | 1.72M | } else { |
1571 | 54.0k | return tok_get_fstring_mode(tok, current_tok, token); |
1572 | 54.0k | } |
1573 | 1.77M | } |
1574 | | |
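tok_get() is the dispatcher for the mode stack: FSTRING_START/TSTRING_START push a TOK_FSTRING_MODE entry via TOK_NEXT_MODE() and FTSTRING_END pops it, so the two loops call into each other for nested f-strings. A schematic walk (illustrative token stream, not an exact transcript):

/* f"a{ f"b{2}c" }d"
 *   FSTRING_START   f"       push mode[1] (TOK_FSTRING_MODE)
 *   FSTRING_MIDDLE  a        tok_get_fstring_mode()
 *   OP              {        mode[1] switches to TOK_REGULAR_MODE
 *   FSTRING_START   f"       push mode[2] for the inner string
 *   ...                      inner MIDDLE/OP/END tokens, pop mode[2]
 *   OP              }        mode[1] back to TOK_FSTRING_MODE
 *   FSTRING_MIDDLE  d
 *   FSTRING_END     "        pop back to mode[0]                    */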
1575 | | int |
1576 | | _PyTokenizer_Get(struct tok_state *tok, struct token *token) |
1577 | 1.77M | { |
1578 | 1.77M | int result = tok_get(tok, token); |
1579 | 1.77M | if (tok->decoding_erred) { |
1580 | 0 | result = ERRORTOKEN; |
1581 | 0 | tok->done = E_DECODE; |
1582 | 0 | } |
1583 | 1.77M | return result; |
1584 | 1.77M | } |