Coverage Report

Created: 2025-07-04 06:49

/src/cpython/Parser/lexer/lexer.c
Line
Count
Source
1
#include "Python.h"
2
#include "pycore_token.h"
3
#include "pycore_unicodeobject.h"
4
#include "errcode.h"
5
6
#include "state.h"
7
#include "../tokenizer/helpers.h"
8
9
/* Alternate tab spacing */
10
9.92k
#define ALTTABSIZE 1
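/* Explanatory note (not part of the source): with ALTTABSIZE == 1 a tab
   advances the "alternate" column by a single position, while the real
   column uses tok->tabsize (typically 8). Two lines that mix tabs and
   spaces differently can then agree on `col` but disagree on `altcol`;
   the indentation checks below treat that disagreement as inconsistent
   use of tabs and spaces. */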
11
12
1.76M
#define is_potential_identifier_start(c) (\
13
1.76M
              (c >= 'a' && c <= 'z')\
14
1.76M
               || (c >= 'A' && c <= 'Z')\
15
1.76M
               || c == '_'\
16
1.76M
               || (c >= 128))
17
18
2.34M
#define is_potential_identifier_char(c) (\
19
2.34M
              (c >= 'a' && c <= 'z')\
20
2.34M
               || (c >= 'A' && c <= 'Z')\
21
2.34M
               || (c >= '0' && c <= '9')\
22
2.34M
               || c == '_'\
23
2.34M
               || (c >= 128))
24
25
#ifdef Py_DEBUG
26
static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) {
27
    assert(tok->tok_mode_stack_index >= 0);
28
    assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL);
29
    return &(tok->tok_mode_stack[tok->tok_mode_stack_index]);
30
}
31
static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) {
32
    assert(tok->tok_mode_stack_index >= 0);
33
    assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL);
34
    return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]);
35
}
36
#else
37
1.89M
#define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index]))
38
16.8k
#define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index]))
39
#endif
40
41
#define FTSTRING_MIDDLE(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_MIDDLE : FSTRING_MIDDLE)
42
#define FTSTRING_END(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_END : FSTRING_END)
43
32
#define TOK_GET_STRING_PREFIX(tok) (TOK_GET_MODE(tok)->string_kind == TSTRING ? 't' : 'f')
44
1.77M
#define MAKE_TOKEN(token_type) _PyLexer_token_setup(tok, token, token_type, p_start, p_end)
45
0
#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
46
0
                _PyLexer_type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))
47
48
/* Spaces in this constant are treated as "zero or more spaces or tabs" when
49
   tokenizing. */
50
static const char* type_comment_prefix = "# type: ";
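/* Illustrative (not in the source): "# type: int", "#type:int" and
   "#  type: \t int" all match this prefix, because each space in it
   accepts any run (including an empty one) of spaces and tabs. */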
51
52
static inline int
53
contains_null_bytes(const char* str, size_t size)
54
224k
{
55
224k
    return memchr(str, 0, size) != NULL;
56
224k
}
57
58
/* Get next char, updating state; error code goes into tok->done */
59
static int
60
tok_nextc(struct tok_state *tok)
61
10.6M
{
62
10.6M
    int rc;
63
10.9M
    for (;;) {
64
10.9M
        if (tok->cur != tok->inp) {
65
10.6M
            if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
66
0
                tok->done = E_COLUMNOVERFLOW;
67
0
                return EOF;
68
0
            }
69
10.6M
            tok->col_offset++;
70
10.6M
            return Py_CHARMASK(*tok->cur++); /* Fast path */
71
10.6M
        }
72
277k
        if (tok->done != E_OK) {
73
35.2k
            return EOF;
74
35.2k
        }
75
242k
        rc = tok->underflow(tok);
76
#if defined(Py_DEBUG)
77
        if (tok->debug) {
78
            fprintf(stderr, "line[%d] = ", tok->lineno);
79
            _PyTokenizer_print_escape(stderr, tok->cur, tok->inp - tok->cur);
80
            fprintf(stderr, "  tok->done = %d\n", tok->done);
81
        }
82
#endif
83
242k
        if (!rc) {
84
17.7k
            tok->cur = tok->inp;
85
17.7k
            return EOF;
86
17.7k
        }
87
224k
        tok->line_start = tok->cur;
88
89
224k
        if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) {
90
0
            _PyTokenizer_syntaxerror(tok, "source code cannot contain null bytes");
91
0
            tok->cur = tok->inp;
92
0
            return EOF;
93
0
        }
94
224k
    }
95
10.6M
    Py_UNREACHABLE();
96
10.6M
}
97
98
/* Back up one character */
99
static void
100
tok_backup(struct tok_state *tok, int c)
101
3.73M
{
102
3.73M
    if (c != EOF) {
103
3.70M
        if (--tok->cur < tok->buf) {
104
0
            Py_FatalError("tokenizer beginning of buffer");
105
0
        }
106
3.70M
        if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) {
107
0
            Py_FatalError("tok_backup: wrong character");
108
0
        }
109
3.70M
        tok->col_offset--;
110
3.70M
    }
111
3.73M
}
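/* Typical pairing of tok_nextc() and tok_backup() (illustrative sketch, not
   part of the source): read one character ahead and push it back when it
   does not belong to the current token:

       int c = tok_nextc(tok);
       if (c != '.') {
           tok_backup(tok, c);   // leave it for the next token
       }
*/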
112
113
static int
114
22.4k
set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
115
22.4k
    assert(token != NULL);
116
22.4k
    assert(c == '}' || c == ':' || c == '!');
117
22.4k
    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);
118
119
22.4k
    if (!(tok_mode->in_debug || tok_mode->string_kind == TSTRING) || token->metadata) {
120
13.7k
        return 0;
121
13.7k
    }
122
8.70k
    PyObject *res = NULL;
123
124
    // Check if there is a # character in the expression
125
8.70k
    int hash_detected = 0;
126
1.36M
    for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) {
127
1.35M
        if (tok_mode->last_expr_buffer[i] == '#') {
128
1.06k
            hash_detected = 1;
129
1.06k
            break;
130
1.06k
        }
131
1.35M
    }
132
133
8.70k
    if (hash_detected) {
134
1.06k
        Py_ssize_t input_length = tok_mode->last_expr_size - tok_mode->last_expr_end;
135
1.06k
        char *result = (char *)PyMem_Malloc((input_length + 1) * sizeof(char));
136
1.06k
        if (!result) {
137
0
            return -1;
138
0
        }
139
140
1.06k
        Py_ssize_t i = 0;
141
1.06k
        Py_ssize_t j = 0;
142
143
39.2k
        for (i = 0, j = 0; i < input_length; i++) {
144
38.1k
            if (tok_mode->last_expr_buffer[i] == '#') {
145
                // Skip characters until newline or end of string
146
20.3k
                while (i < input_length && tok_mode->last_expr_buffer[i] != '\0') {
147
19.3k
                    if (tok_mode->last_expr_buffer[i] == '\n') {
148
288
                        result[j++] = tok_mode->last_expr_buffer[i];
149
288
                        break;
150
288
                    }
151
19.1k
                    i++;
152
19.1k
                }
153
36.9k
            } else {
154
36.9k
                result[j++] = tok_mode->last_expr_buffer[i];
155
36.9k
            }
156
38.1k
        }
157
158
1.06k
        result[j] = '\0';  // Null-terminate the result string
159
1.06k
        res = PyUnicode_DecodeUTF8(result, j, NULL);
160
1.06k
        PyMem_Free(result);
161
7.63k
    } else {
162
7.63k
        res = PyUnicode_DecodeUTF8(
163
7.63k
            tok_mode->last_expr_buffer,
164
7.63k
            tok_mode->last_expr_size - tok_mode->last_expr_end,
165
7.63k
            NULL
166
7.63k
        );
167
168
7.63k
    }
169
170
171
8.70k
    if (!res) {
172
5
        return -1;
173
5
    }
174
8.69k
    token->metadata = res;
175
8.69k
    return 0;
176
8.70k
}
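/* Illustrative example (not in the source): for a multi-line f-string such as

       f"{(1 +  # add one
            2)=}"

   the '#' branch above copies the expression into token->metadata with the
   comment text dropped (the newline itself is kept), so the '=' debug output
   reflects the expression rather than the comment. */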
177
178
int
179
_PyLexer_update_ftstring_expr(struct tok_state *tok, char cur)
180
65.9k
{
181
65.9k
    assert(tok->cur != NULL);
182
183
65.9k
    Py_ssize_t size = strlen(tok->cur);
184
65.9k
    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);
185
186
65.9k
    switch (cur) {
187
0
        case 0:
188
0
            if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) {
189
0
                return 1;
190
0
            }
191
0
            char *new_buffer = PyMem_Realloc(
192
0
                tok_mode->last_expr_buffer,
193
0
                tok_mode->last_expr_size + size
194
0
            );
195
0
            if (new_buffer == NULL) {
196
0
                PyMem_Free(tok_mode->last_expr_buffer);
197
0
                goto error;
198
0
            }
199
0
            tok_mode->last_expr_buffer = new_buffer;
200
0
            strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size);
201
0
            tok_mode->last_expr_size += size;
202
0
            break;
203
43.5k
        case '{':
204
43.5k
            if (tok_mode->last_expr_buffer != NULL) {
205
33.1k
                PyMem_Free(tok_mode->last_expr_buffer);
206
33.1k
            }
207
43.5k
            tok_mode->last_expr_buffer = PyMem_Malloc(size);
208
43.5k
            if (tok_mode->last_expr_buffer == NULL) {
209
0
                goto error;
210
0
            }
211
43.5k
            tok_mode->last_expr_size = size;
212
43.5k
            tok_mode->last_expr_end = -1;
213
43.5k
            strncpy(tok_mode->last_expr_buffer, tok->cur, size);
214
43.5k
            break;
215
18.3k
        case '}':
216
19.8k
        case '!':
217
19.8k
            tok_mode->last_expr_end = strlen(tok->start);
218
19.8k
            break;
219
2.56k
        case ':':
220
2.56k
            if (tok_mode->last_expr_end == -1) {
221
2.33k
               tok_mode->last_expr_end = strlen(tok->start);
222
2.33k
            }
223
2.56k
            break;
224
0
        default:
225
0
            Py_UNREACHABLE();
226
65.9k
    }
227
65.9k
    return 1;
228
0
error:
229
0
    tok->done = E_NOMEM;
230
0
    return 0;
231
65.9k
}
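/* How the last_expr_* fields work together (explanatory note, not part of
   the source): on '{' the remaining input is snapshotted into
   last_expr_buffer (last_expr_size bytes); on '}', '!' or ':' last_expr_end
   records how much input is still left at the token's start, so the
   expression text is the first (last_expr_size - last_expr_end) bytes of
   last_expr_buffer, which is exactly the slice set_ftstring_expr() decodes. */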
232
233
static int
234
lookahead(struct tok_state *tok, const char *test)
235
8.92k
{
236
8.92k
    const char *s = test;
237
8.92k
    int res = 0;
238
23.4k
    while (1) {
239
23.4k
        int c = tok_nextc(tok);
240
23.4k
        if (*s == 0) {
241
8.82k
            res = !is_potential_identifier_char(c);
242
8.82k
        }
243
14.6k
        else if (c == *s) {
244
14.5k
            s++;
245
14.5k
            continue;
246
14.5k
        }
247
248
8.92k
        tok_backup(tok, c);
249
23.4k
        while (s != test) {
250
14.5k
            tok_backup(tok, *--s);
251
14.5k
        }
252
8.92k
        return res;
253
23.4k
    }
254
8.92k
}
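/* Illustrative usage (not in the source): after consuming "1" and reading
   'a', verify_end_of_number() below calls lookahead(tok, "nd") to test for
   the keyword "and". On a full match the input is restored and the result
   additionally requires that no identifier character follows, so "1and"
   matches while "1andx" does not. */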
255
256
static int
257
98.6k
verify_end_of_number(struct tok_state *tok, int c, const char *kind) {
258
98.6k
    if (tok->tok_extra_tokens) {
259
        // When we are parsing extra tokens, we don't want to emit warnings
260
        // about invalid literals, because we want to be a bit more liberal.
261
0
        return 1;
262
0
    }
263
    /* Emit a deprecation warning only if the numeric literal is immediately
264
     * followed by one of the keywords that can occur after a numeric literal
265
     * in valid code: "and", "else", "for", "if", "in", "is" and "or".
266
     * This allows gradually deprecating existing valid code without emitting
267
     * a warning before the error in most cases of invalid numeric literals
268
     * (which would be confusing and break existing tests).
269
     * Raise a syntax error with a slightly better message than plain
270
     * "invalid syntax" if the numeric literal is immediately followed by
271
     * another keyword or identifier.
272
     */
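    /* Illustrative (not in the source): "1if x else y" only triggers the
       SyntaxWarning because "if" can follow a number in valid code, while
       "1abc" falls through to the unconditional "invalid decimal literal"
       syntax error below. */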
273
98.6k
    int r = 0;
274
98.6k
    if (c == 'a') {
275
838
        r = lookahead(tok, "nd");
276
838
    }
277
97.7k
    else if (c == 'e') {
278
469
        r = lookahead(tok, "lse");
279
469
    }
280
97.3k
    else if (c == 'f') {
281
3.55k
        r = lookahead(tok, "or");
282
3.55k
    }
283
93.7k
    else if (c == 'i') {
284
1.90k
        int c2 = tok_nextc(tok);
285
1.90k
        if (c2 == 'f' || c2 == 'n' || c2 == 's') {
286
1.88k
            r = 1;
287
1.88k
        }
288
1.90k
        tok_backup(tok, c2);
289
1.90k
    }
290
91.8k
    else if (c == 'o') {
291
3.54k
        r = lookahead(tok, "r");
292
3.54k
    }
293
88.3k
    else if (c == 'n') {
294
520
        r = lookahead(tok, "ot");
295
520
    }
296
98.6k
    if (r) {
297
10.6k
        tok_backup(tok, c);
298
10.6k
        if (_PyTokenizer_parser_warn(tok, PyExc_SyntaxWarning,
299
10.6k
                "invalid %s literal", kind))
300
0
        {
301
0
            return 0;
302
0
        }
303
10.6k
        tok_nextc(tok);
304
10.6k
    }
305
87.9k
    else /* In future releases, only error will remain. */
306
87.9k
    if (c < 128 && is_potential_identifier_char(c)) {
307
213
        tok_backup(tok, c);
308
213
        _PyTokenizer_syntaxerror(tok, "invalid %s literal", kind);
309
213
        return 0;
310
213
    }
311
98.4k
    return 1;
312
98.6k
}
313
314
/* Verify that the identifier follows PEP 3131. */
315
static int
316
verify_identifier(struct tok_state *tok)
317
14.2k
{
318
14.2k
    if (tok->tok_extra_tokens) {
319
0
        return 1;
320
0
    }
321
14.2k
    PyObject *s;
322
14.2k
    if (tok->decoding_erred)
323
0
        return 0;
324
14.2k
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
325
14.2k
    if (s == NULL) {
326
1.01k
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
327
1.01k
            tok->done = E_DECODE;
328
1.01k
        }
329
0
        else {
330
0
            tok->done = E_ERROR;
331
0
        }
332
1.01k
        return 0;
333
1.01k
    }
334
13.2k
    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
335
13.2k
    assert(invalid >= 0);
336
13.2k
    assert(PyUnicode_GET_LENGTH(s) > 0);
337
13.2k
    if (invalid < PyUnicode_GET_LENGTH(s)) {
338
631
        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
339
631
        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
340
            /* Determine the offset in UTF-8 encoded input */
341
430
            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
342
430
            if (s != NULL) {
343
430
                Py_SETREF(s, PyUnicode_AsUTF8String(s));
344
430
            }
345
430
            if (s == NULL) {
346
0
                tok->done = E_ERROR;
347
0
                return 0;
348
0
            }
349
430
            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
350
430
        }
351
631
        Py_DECREF(s);
352
631
        if (Py_UNICODE_ISPRINTABLE(ch)) {
353
361
            _PyTokenizer_syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch);
354
361
        }
355
270
        else {
356
270
            _PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", ch);
357
270
        }
358
631
        return 0;
359
631
    }
360
12.5k
    Py_DECREF(s);
361
12.5k
    return 1;
362
13.2k
}
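/* Illustrative (not in the source): "héllo" is accepted because every
   character is XID-valid, while "a€b" fails at index 1 and reports
   something like "invalid character '€' (U+20AC)", with tok->cur narrowed
   to the UTF-8 offset computed above. */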
363
364
static int
365
tok_decimal_tail(struct tok_state *tok)
366
80.0k
{
367
80.0k
    int c;
368
369
80.6k
    while (1) {
370
224k
        do {
371
224k
            c = tok_nextc(tok);
372
224k
        } while (Py_ISDIGIT(c));
373
80.6k
        if (c != '_') {
374
80.0k
            break;
375
80.0k
        }
376
562
        c = tok_nextc(tok);
377
562
        if (!Py_ISDIGIT(c)) {
378
15
            tok_backup(tok, c);
379
15
            _PyTokenizer_syntaxerror(tok, "invalid decimal literal");
380
15
            return 0;
381
15
        }
382
562
    }
383
80.0k
    return c;
384
80.0k
}
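/* Illustrative (not in the source): "1_000" is consumed as a single run of
   digits, while "1__0" and a trailing "1_" stop at an underscore that is
   not followed by a digit and raise "invalid decimal literal". */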
385
386
static inline int
387
1.20k
tok_continuation_line(struct tok_state *tok) {
388
1.20k
    int c = tok_nextc(tok);
389
1.20k
    if (c == '\r') {
390
68
        c = tok_nextc(tok);
391
68
    }
392
1.20k
    if (c != '\n') {
393
66
        tok->done = E_LINECONT;
394
66
        return -1;
395
66
    }
396
1.14k
    c = tok_nextc(tok);
397
1.14k
    if (c == EOF) {
398
36
        tok->done = E_EOF;
399
36
        tok->cur = tok->inp;
400
36
        return -1;
401
1.10k
    } else {
402
1.10k
        tok_backup(tok, c);
403
1.10k
    }
404
1.10k
    return c;
405
1.14k
}
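/* Explanatory note (not in the source): after a backslash, only "\<newline>"
   (or "\<CR><LF>") continues the logical line; anything else sets
   E_LINECONT, and a continuation at end of file sets E_EOF. */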
406
407
static int
408
maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,
409
                                             int saw_b, int saw_r, int saw_u,
410
22.0k
                                             int saw_f, int saw_t) {
411
    // Supported: rb, rf, rt (in any order)
412
    // Unsupported: ub, ur, uf, ut, bf, bt, ft (in any order)
413
414
22.0k
#define RETURN_SYNTAX_ERROR(PREFIX1, PREFIX2)                             \
415
22.0k
    do {                                                                  \
416
8
        (void)_PyTokenizer_syntaxerror_known_range(                       \
417
8
            tok, (int)(tok->start + 1 - tok->line_start),                 \
418
8
            (int)(tok->cur - tok->line_start),                            \
419
8
            "'" PREFIX1 "' and '" PREFIX2 "' prefixes are incompatible"); \
420
8
        return -1;                                                        \
421
8
    } while (0)
422
423
22.0k
    if (saw_u && saw_b) {
424
1
        RETURN_SYNTAX_ERROR("u", "b");
425
1
    }
426
22.0k
    if (saw_u && saw_r) {
427
1
        RETURN_SYNTAX_ERROR("u", "r");
428
1
    }
429
22.0k
    if (saw_u && saw_f) {
430
1
        RETURN_SYNTAX_ERROR("u", "f");
431
1
    }
432
22.0k
    if (saw_u && saw_t) {
433
1
        RETURN_SYNTAX_ERROR("u", "t");
434
1
    }
435
436
22.0k
    if (saw_b && saw_f) {
437
2
        RETURN_SYNTAX_ERROR("b", "f");
438
2
    }
439
22.0k
    if (saw_b && saw_t) {
440
1
        RETURN_SYNTAX_ERROR("b", "t");
441
1
    }
442
443
22.0k
    if (saw_f && saw_t) {
444
1
        RETURN_SYNTAX_ERROR("f", "t");
445
1
    }
446
447
22.0k
#undef RETURN_SYNTAX_ERROR
448
449
22.0k
    return 0;
450
22.0k
}
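/* Illustrative (not in the source): ub"x" fails here with "'u' and 'b'
   prefixes are incompatible", with the error range spanning the whole
   prefix; rb"x", fR"x" and rt"x" pass through and are handled as raw
   byte-, f- and t-strings below. */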
451
452
static int
453
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
454
1.73M
{
455
1.73M
    int c;
456
1.73M
    int blankline, nonascii;
457
458
1.73M
    const char *p_start = NULL;
459
1.73M
    const char *p_end = NULL;
460
1.81M
  nextline:
461
1.81M
    tok->start = NULL;
462
1.81M
    tok->starting_col_offset = -1;
463
1.81M
    blankline = 0;
464
465
466
    /* Get indentation level */
467
1.81M
    if (tok->atbol) {
468
225k
        int col = 0;
469
225k
        int altcol = 0;
470
225k
        tok->atbol = 0;
471
225k
        int cont_line_col = 0;
472
839k
        for (;;) {
473
839k
            c = tok_nextc(tok);
474
839k
            if (c == ' ') {
475
606k
                col++, altcol++;
476
606k
            }
477
233k
            else if (c == '\t') {
478
4.96k
                col = (col / tok->tabsize + 1) * tok->tabsize;
479
4.96k
                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
480
4.96k
            }
481
228k
            else if (c == '\014') { /* Control-L (formfeed) */
482
2.11k
                col = altcol = 0; /* For Emacs users */
483
2.11k
            }
484
226k
            else if (c == '\\') {
485
                // Indentation cannot be split over multiple physical lines
486
                // using backslashes. This means that if we found a backslash
487
                // preceded by whitespace, **the first one we find** determines
488
                // the level of indentation of whatever comes next.
489
782
                cont_line_col = cont_line_col ? cont_line_col : col;
490
782
                if ((c = tok_continuation_line(tok)) == -1) {
491
30
                    return MAKE_TOKEN(ERRORTOKEN);
492
30
                }
493
782
            }
494
225k
            else {
495
225k
                break;
496
225k
            }
497
839k
        }
498
225k
        tok_backup(tok, c);
499
225k
        if (c == '#' || c == '\n' || c == '\r') {
500
            /* Lines with only whitespace and/or comments
501
               shouldn't affect the indentation and are
502
               not passed to the parser as NEWLINE tokens,
503
               except *totally* empty lines in interactive
504
               mode, which signal the end of a command group. */
505
44.6k
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
506
0
                blankline = 0; /* Let it through */
507
0
            }
508
44.6k
            else if (tok->prompt != NULL && tok->lineno == 1) {
509
                /* In interactive mode, if the first line contains
510
                   only spaces and/or a comment, let it through. */
511
0
                blankline = 0;
512
0
                col = altcol = 0;
513
0
            }
514
44.6k
            else {
515
44.6k
                blankline = 1; /* Ignore completely */
516
44.6k
            }
517
            /* We can't jump back right here since we still
518
               may need to skip to the end of a comment */
519
44.6k
        }
520
225k
        if (!blankline && tok->level == 0) {
521
141k
            col = cont_line_col ? cont_line_col : col;
522
141k
            altcol = cont_line_col ? cont_line_col : altcol;
523
141k
            if (col == tok->indstack[tok->indent]) {
524
                /* No change */
525
107k
                if (altcol != tok->altindstack[tok->indent]) {
526
1
                    return MAKE_TOKEN(_PyTokenizer_indenterror(tok));
527
1
                }
528
107k
            }
529
34.1k
            else if (col > tok->indstack[tok->indent]) {
530
                /* Indent -- always one */
531
19.1k
                if (tok->indent+1 >= MAXINDENT) {
532
0
                    tok->done = E_TOODEEP;
533
0
                    tok->cur = tok->inp;
534
0
                    return MAKE_TOKEN(ERRORTOKEN);
535
0
                }
536
19.1k
                if (altcol <= tok->altindstack[tok->indent]) {
537
3
                    return MAKE_TOKEN(_PyTokenizer_indenterror(tok));
538
3
                }
539
19.1k
                tok->pendin++;
540
19.1k
                tok->indstack[++tok->indent] = col;
541
19.1k
                tok->altindstack[tok->indent] = altcol;
542
19.1k
            }
543
14.9k
            else /* col < tok->indstack[tok->indent] */ {
544
                /* Dedent -- any number, must be consistent */
545
33.3k
                while (tok->indent > 0 &&
546
33.3k
                    col < tok->indstack[tok->indent]) {
547
18.3k
                    tok->pendin--;
548
18.3k
                    tok->indent--;
549
18.3k
                }
550
14.9k
                if (col != tok->indstack[tok->indent]) {
551
8
                    tok->done = E_DEDENT;
552
8
                    tok->cur = tok->inp;
553
8
                    return MAKE_TOKEN(ERRORTOKEN);
554
8
                }
555
14.9k
                if (altcol != tok->altindstack[tok->indent]) {
556
1
                    return MAKE_TOKEN(_PyTokenizer_indenterror(tok));
557
1
                }
558
14.9k
            }
559
141k
        }
560
225k
    }
561
562
1.81M
    tok->start = tok->cur;
563
1.81M
    tok->starting_col_offset = tok->col_offset;
564
565
    /* Return pending indents/dedents */
566
1.81M
    if (tok->pendin != 0) {
567
37.5k
        if (tok->pendin < 0) {
568
18.3k
            if (tok->tok_extra_tokens) {
569
0
                p_start = tok->cur;
570
0
                p_end = tok->cur;
571
0
            }
572
18.3k
            tok->pendin++;
573
18.3k
            return MAKE_TOKEN(DEDENT);
574
18.3k
        }
575
19.1k
        else {
576
19.1k
            if (tok->tok_extra_tokens) {
577
0
                p_start = tok->buf;
578
0
                p_end = tok->cur;
579
0
            }
580
19.1k
            tok->pendin--;
581
19.1k
            return MAKE_TOKEN(INDENT);
582
19.1k
        }
583
37.5k
    }
584
585
    /* Peek ahead at the next character */
586
1.77M
    c = tok_nextc(tok);
587
1.77M
    tok_backup(tok, c);
588
589
1.77M
 again:
590
1.77M
    tok->start = NULL;
591
    /* Skip spaces */
592
2.11M
    do {
593
2.11M
        c = tok_nextc(tok);
594
2.11M
    } while (c == ' ' || c == '\t' || c == '\014');
595
596
    /* Set start of current token */
597
1.77M
    tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
598
1.77M
    tok->starting_col_offset = tok->col_offset - 1;
599
600
    /* Skip comment, unless it's a type comment */
601
1.77M
    if (c == '#') {
602
603
42.0k
        const char* p = NULL;
604
42.0k
        const char *prefix, *type_start;
605
42.0k
        int current_starting_col_offset;
606
607
1.26M
        while (c != EOF && c != '\n' && c != '\r') {
608
1.22M
            c = tok_nextc(tok);
609
1.22M
        }
610
611
42.0k
        if (tok->tok_extra_tokens) {
612
0
            p = tok->start;
613
0
        }
614
615
42.0k
        if (tok->type_comments) {
616
0
            p = tok->start;
617
0
            current_starting_col_offset = tok->starting_col_offset;
618
0
            prefix = type_comment_prefix;
619
0
            while (*prefix && p < tok->cur) {
620
0
                if (*prefix == ' ') {
621
0
                    while (*p == ' ' || *p == '\t') {
622
0
                        p++;
623
0
                        current_starting_col_offset++;
624
0
                    }
625
0
                } else if (*prefix == *p) {
626
0
                    p++;
627
0
                    current_starting_col_offset++;
628
0
                } else {
629
0
                    break;
630
0
                }
631
632
0
                prefix++;
633
0
            }
634
635
            /* This is a type comment if we matched all of type_comment_prefix. */
636
0
            if (!*prefix) {
637
0
                int is_type_ignore = 1;
638
                // +6 in order to skip the word 'ignore'
639
0
                const char *ignore_end = p + 6;
640
0
                const int ignore_end_col_offset = current_starting_col_offset + 6;
641
0
                tok_backup(tok, c);  /* don't eat the newline or EOF */
642
643
0
                type_start = p;
644
645
                /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
646
                 * or anything ASCII and non-alphanumeric. */
647
0
                is_type_ignore = (
648
0
                    tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
649
0
                    && !(tok->cur > ignore_end
650
0
                         && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
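                /* Illustrative (not in the source): "# type: ignore" and
                   "# type: ignore[no-any-return]" qualify as TYPE_IGNORE,
                   but "# type: ignored" does not, because an ASCII
                   alphanumeric (or non-ASCII byte) directly after "ignore"
                   disqualifies it. */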
651
652
0
                if (is_type_ignore) {
653
0
                    p_start = ignore_end;
654
0
                    p_end = tok->cur;
655
656
                    /* If this type ignore is the only thing on the line, consume the newline also. */
657
0
                    if (blankline) {
658
0
                        tok_nextc(tok);
659
0
                        tok->atbol = 1;
660
0
                    }
661
0
                    return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset);
662
0
                } else {
663
0
                    p_start = type_start;
664
0
                    p_end = tok->cur;
665
0
                    return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset);
666
0
                }
667
0
            }
668
0
        }
669
42.0k
        if (tok->tok_extra_tokens) {
670
0
            tok_backup(tok, c);  /* don't eat the newline or EOF */
671
0
            p_start = p;
672
0
            p_end = tok->cur;
673
0
            tok->comment_newline = blankline;
674
0
            return MAKE_TOKEN(COMMENT);
675
0
        }
676
42.0k
    }
677
678
1.77M
    if (tok->done == E_INTERACT_STOP) {
679
0
        return MAKE_TOKEN(ENDMARKER);
680
0
    }
681
682
    /* Check for EOF and errors now */
683
1.77M
    if (c == EOF) {
684
17.6k
        if (tok->level) {
685
4.31k
            return MAKE_TOKEN(ERRORTOKEN);
686
4.31k
        }
687
13.2k
        return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN);
688
17.6k
    }
689
690
    /* Identifier (most frequent token!) */
691
1.76M
    nonascii = 0;
692
1.76M
    if (is_potential_identifier_start(c)) {
693
        /* Process the various legal combinations of b"", r"", u"", f"", and t"". */
694
531k
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0;
695
652k
        while (1) {
696
652k
            if (!saw_b && (c == 'b' || c == 'B')) {
697
21.1k
                saw_b = 1;
698
21.1k
            }
699
            /* Since this is a backwards-compatibility support literal, we
700
               don't want to support it in arbitrary order like byte literals. */
701
631k
            else if (!saw_u && (c == 'u'|| c == 'U')) {
702
6.45k
                saw_u = 1;
703
6.45k
            }
704
            /* ur"" and ru"" are not supported */
705
625k
            else if (!saw_r && (c == 'r' || c == 'R')) {
706
37.2k
                saw_r = 1;
707
37.2k
            }
708
587k
            else if (!saw_f && (c == 'f' || c == 'F')) {
709
47.0k
                saw_f = 1;
710
47.0k
            }
711
540k
            else if (!saw_t && (c == 't' || c == 'T')) {
712
31.6k
                saw_t = 1;
713
31.6k
            }
714
509k
            else {
715
509k
                break;
716
509k
            }
717
143k
            c = tok_nextc(tok);
718
143k
            if (c == '"' || c == '\'') {
719
                // Raise error on incompatible string prefixes:
720
22.0k
                int status = maybe_raise_syntax_error_for_string_prefixes(
721
22.0k
                    tok, saw_b, saw_r, saw_u, saw_f, saw_t);
722
22.0k
                if (status < 0) {
723
8
                    return MAKE_TOKEN(ERRORTOKEN);
724
8
                }
725
726
                // Handle valid f or t string creation:
727
22.0k
                if (saw_f || saw_t) {
728
16.8k
                    goto f_string_quote;
729
16.8k
                }
730
5.20k
                goto letter_quote;
731
22.0k
            }
732
143k
        }
733
2.25M
        while (is_potential_identifier_char(c)) {
734
1.74M
            if (c >= 128) {
735
198k
                nonascii = 1;
736
198k
            }
737
1.74M
            c = tok_nextc(tok);
738
1.74M
        }
739
509k
        tok_backup(tok, c);
740
509k
        if (nonascii && !verify_identifier(tok)) {
741
1.64k
            return MAKE_TOKEN(ERRORTOKEN);
742
1.64k
        }
743
744
507k
        p_start = tok->start;
745
507k
        p_end = tok->cur;
746
747
507k
        return MAKE_TOKEN(NAME);
748
509k
    }
749
750
1.23M
    if (c == '\r') {
751
436
        c = tok_nextc(tok);
752
436
    }
753
754
    /* Newline */
755
1.23M
    if (c == '\n') {
756
203k
        tok->atbol = 1;
757
203k
        if (blankline || tok->level > 0) {
758
83.8k
            if (tok->tok_extra_tokens) {
759
0
                if (tok->comment_newline) {
760
0
                    tok->comment_newline = 0;
761
0
                }
762
0
                p_start = tok->start;
763
0
                p_end = tok->cur;
764
0
                return MAKE_TOKEN(NL);
765
0
            }
766
83.8k
            goto nextline;
767
83.8k
        }
768
120k
        if (tok->comment_newline && tok->tok_extra_tokens) {
769
0
            tok->comment_newline = 0;
770
0
            p_start = tok->start;
771
0
            p_end = tok->cur;
772
0
            return MAKE_TOKEN(NL);
773
0
        }
774
120k
        p_start = tok->start;
775
120k
        p_end = tok->cur - 1; /* Leave '\n' out of the string */
776
120k
        tok->cont_line = 0;
777
120k
        return MAKE_TOKEN(NEWLINE);
778
120k
    }
779
780
    /* Period or number starting with period? */
781
1.02M
    if (c == '.') {
782
31.8k
        c = tok_nextc(tok);
783
31.8k
        if (Py_ISDIGIT(c)) {
784
3.27k
            goto fraction;
785
28.5k
        } else if (c == '.') {
786
3.49k
            c = tok_nextc(tok);
787
3.49k
            if (c == '.') {
788
2.86k
                p_start = tok->start;
789
2.86k
                p_end = tok->cur;
790
2.86k
                return MAKE_TOKEN(ELLIPSIS);
791
2.86k
            }
792
634
            else {
793
634
                tok_backup(tok, c);
794
634
            }
795
634
            tok_backup(tok, '.');
796
634
        }
797
25.0k
        else {
798
25.0k
            tok_backup(tok, c);
799
25.0k
        }
800
25.7k
        p_start = tok->start;
801
25.7k
        p_end = tok->cur;
802
25.7k
        return MAKE_TOKEN(DOT);
803
31.8k
    }
804
805
    /* Number */
806
995k
    if (Py_ISDIGIT(c)) {
807
95.4k
        if (c == '0') {
808
            /* Hex, octal or binary -- maybe. */
809
32.6k
            c = tok_nextc(tok);
810
32.6k
            if (c == 'x' || c == 'X') {
811
                /* Hex */
812
15.9k
                c = tok_nextc(tok);
813
16.1k
                do {
814
16.1k
                    if (c == '_') {
815
231
                        c = tok_nextc(tok);
816
231
                    }
817
16.1k
                    if (!Py_ISXDIGIT(c)) {
818
19
                        tok_backup(tok, c);
819
19
                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid hexadecimal literal"));
820
19
                    }
821
79.3k
                    do {
822
79.3k
                        c = tok_nextc(tok);
823
79.3k
                    } while (Py_ISXDIGIT(c));
824
16.1k
                } while (c == '_');
825
15.9k
                if (!verify_end_of_number(tok, c, "hexadecimal")) {
826
3
                    return MAKE_TOKEN(ERRORTOKEN);
827
3
                }
828
15.9k
            }
829
16.6k
            else if (c == 'o' || c == 'O') {
830
                /* Octal */
831
742
                c = tok_nextc(tok);
832
1.43k
                do {
833
1.43k
                    if (c == '_') {
834
695
                        c = tok_nextc(tok);
835
695
                    }
836
1.43k
                    if (c < '0' || c >= '8') {
837
27
                        if (Py_ISDIGIT(c)) {
838
1
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
839
1
                                    "invalid digit '%c' in octal literal", c));
840
1
                        }
841
26
                        else {
842
26
                            tok_backup(tok, c);
843
26
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid octal literal"));
844
26
                        }
845
27
                    }
846
4.61k
                    do {
847
4.61k
                        c = tok_nextc(tok);
848
4.61k
                    } while ('0' <= c && c < '8');
849
1.40k
                } while (c == '_');
850
715
                if (Py_ISDIGIT(c)) {
851
2
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
852
2
                            "invalid digit '%c' in octal literal", c));
853
2
                }
854
713
                if (!verify_end_of_number(tok, c, "octal")) {
855
2
                    return MAKE_TOKEN(ERRORTOKEN);
856
2
                }
857
713
            }
858
15.9k
            else if (c == 'b' || c == 'B') {
859
                /* Binary */
860
591
                c = tok_nextc(tok);
861
903
                do {
862
903
                    if (c == '_') {
863
320
                        c = tok_nextc(tok);
864
320
                    }
865
903
                    if (c != '0' && c != '1') {
866
29
                        if (Py_ISDIGIT(c)) {
867
1
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c));
868
1
                        }
869
28
                        else {
870
28
                            tok_backup(tok, c);
871
28
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid binary literal"));
872
28
                        }
873
29
                    }
874
4.41k
                    do {
875
4.41k
                        c = tok_nextc(tok);
876
4.41k
                    } while (c == '0' || c == '1');
877
874
                } while (c == '_');
878
562
                if (Py_ISDIGIT(c)) {
879
1
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c));
880
1
                }
881
561
                if (!verify_end_of_number(tok, c, "binary")) {
882
1
                    return MAKE_TOKEN(ERRORTOKEN);
883
1
                }
884
561
            }
885
15.3k
            else {
886
15.3k
                int nonzero = 0;
887
                /* maybe old-style octal; c is first char of it */
888
                /* in any case, allow '0' as a literal */
889
17.0k
                while (1) {
890
17.0k
                    if (c == '_') {
891
209
                        c = tok_nextc(tok);
892
209
                        if (!Py_ISDIGIT(c)) {
893
3
                            tok_backup(tok, c);
894
3
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal"));
895
3
                        }
896
209
                    }
897
17.0k
                    if (c != '0') {
898
15.3k
                        break;
899
15.3k
                    }
900
1.71k
                    c = tok_nextc(tok);
901
1.71k
                }
902
15.3k
                char* zeros_end = tok->cur;
903
15.3k
                if (Py_ISDIGIT(c)) {
904
499
                    nonzero = 1;
905
499
                    c = tok_decimal_tail(tok);
906
499
                    if (c == 0) {
907
3
                        return MAKE_TOKEN(ERRORTOKEN);
908
3
                    }
909
499
                }
910
15.3k
                if (c == '.') {
911
792
                    c = tok_nextc(tok);
912
792
                    goto fraction;
913
792
                }
914
14.5k
                else if (c == 'e' || c == 'E') {
915
963
                    goto exponent;
916
963
                }
917
13.5k
                else if (c == 'j' || c == 'J') {
918
810
                    goto imaginary;
919
810
                }
920
12.7k
                else if (nonzero && !tok->tok_extra_tokens) {
921
                    /* Old-style octal: now disallowed. */
922
24
                    tok_backup(tok, c);
923
24
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror_known_range(
924
24
                            tok, (int)(tok->start + 1 - tok->line_start),
925
24
                            (int)(zeros_end - tok->line_start),
926
24
                            "leading zeros in decimal integer "
927
24
                            "literals are not permitted; "
928
24
                            "use an 0o prefix for octal integers"));
929
24
                }
930
12.7k
                if (!verify_end_of_number(tok, c, "decimal")) {
931
33
                    return MAKE_TOKEN(ERRORTOKEN);
932
33
                }
933
12.7k
            }
934
32.6k
        }
935
62.8k
        else {
936
            /* Decimal */
937
62.8k
            c = tok_decimal_tail(tok);
938
62.8k
            if (c == 0) {
939
10
                return MAKE_TOKEN(ERRORTOKEN);
940
10
            }
941
62.8k
            {
942
                /* Accept floating-point numbers. */
943
62.8k
                if (c == '.') {
944
3.81k
                    c = tok_nextc(tok);
945
7.87k
        fraction:
946
                    /* Fraction */
947
7.87k
                    if (Py_ISDIGIT(c)) {
948
6.18k
                        c = tok_decimal_tail(tok);
949
6.18k
                        if (c == 0) {
950
1
                            return MAKE_TOKEN(ERRORTOKEN);
951
1
                        }
952
6.18k
                    }
953
7.87k
                }
954
66.9k
                if (c == 'e' || c == 'E') {
955
10.0k
                    int e;
956
11.0k
                  exponent:
957
11.0k
                    e = c;
958
                    /* Exponent part */
959
11.0k
                    c = tok_nextc(tok);
960
11.0k
                    if (c == '+' || c == '-') {
961
3.55k
                        c = tok_nextc(tok);
962
3.55k
                        if (!Py_ISDIGIT(c)) {
963
9
                            tok_backup(tok, c);
964
9
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal"));
965
9
                        }
966
7.48k
                    } else if (!Py_ISDIGIT(c)) {
967
478
                        tok_backup(tok, c);
968
478
                        if (!verify_end_of_number(tok, e, "decimal")) {
969
50
                            return MAKE_TOKEN(ERRORTOKEN);
970
50
                        }
971
428
                        tok_backup(tok, e);
972
428
                        p_start = tok->start;
973
428
                        p_end = tok->cur;
974
428
                        return MAKE_TOKEN(NUMBER);
975
478
                    }
976
10.5k
                    c = tok_decimal_tail(tok);
977
10.5k
                    if (c == 0) {
978
1
                        return MAKE_TOKEN(ERRORTOKEN);
979
1
                    }
980
10.5k
                }
981
67.3k
                if (c == 'j' || c == 'J') {
982
                    /* Imaginary part */
983
4.48k
        imaginary:
984
4.48k
                    c = tok_nextc(tok);
985
4.48k
                    if (!verify_end_of_number(tok, c, "imaginary")) {
986
10
                        return MAKE_TOKEN(ERRORTOKEN);
987
10
                    }
988
4.48k
                }
989
63.6k
                else if (!verify_end_of_number(tok, c, "decimal")) {
990
114
                    return MAKE_TOKEN(ERRORTOKEN);
991
114
                }
992
67.3k
            }
993
67.3k
        }
994
97.9k
        tok_backup(tok, c);
995
97.9k
        p_start = tok->start;
996
97.9k
        p_end = tok->cur;
997
97.9k
        return MAKE_TOKEN(NUMBER);
998
95.4k
    }
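    /* Illustrative (not in the source): "0x_ff" and "0b1_0" are accepted,
       "0o8" reports "invalid digit '8' in octal literal", and "012" reports
       the leading-zero error suggesting an 0o prefix. */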
999
1000
916k
  f_string_quote:
1001
916k
    if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't')
1002
916k
        && (c == '\'' || c == '"'))) {
1003
1004
16.8k
        int quote = c;
1005
16.8k
        int quote_size = 1;             /* 1 or 3 */
1006
1007
        /* Nodes of type STRING, especially multi-line strings,
1008
           must be handled differently in order to get both
1009
           the starting line number and the column offset right.
1010
           (cf. issue 16806) */
1011
16.8k
        tok->first_lineno = tok->lineno;
1012
16.8k
        tok->multi_line_start = tok->line_start;
1013
1014
        /* Find the quote size and start of string */
1015
16.8k
        int after_quote = tok_nextc(tok);
1016
16.8k
        if (after_quote == quote) {
1017
2.34k
            int after_after_quote = tok_nextc(tok);
1018
2.34k
            if (after_after_quote == quote) {
1019
791
                quote_size = 3;
1020
791
            }
1021
1.55k
            else {
1022
                // TODO: Check this
1023
1.55k
                tok_backup(tok, after_after_quote);
1024
1.55k
                tok_backup(tok, after_quote);
1025
1.55k
            }
1026
2.34k
        }
1027
16.8k
        if (after_quote != quote) {
1028
14.4k
            tok_backup(tok, after_quote);
1029
14.4k
        }
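        /* Illustrative (not in the source): for f"""...""" both extra quotes
           are consumed and quote_size becomes 3; for f"" the second quote is
           pushed back, so the empty literal is handled by the f-string-mode
           tokenizer on the next call. */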
1030
1031
1032
16.8k
        p_start = tok->start;
1033
16.8k
        p_end = tok->cur;
1034
16.8k
        if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) {
1035
2
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested f-strings or t-strings"));
1036
2
        }
1037
16.8k
        tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok);
1038
16.8k
        the_current_tok->kind = TOK_FSTRING_MODE;
1039
16.8k
        the_current_tok->quote = quote;
1040
16.8k
        the_current_tok->quote_size = quote_size;
1041
16.8k
        the_current_tok->start = tok->start;
1042
16.8k
        the_current_tok->multi_line_start = tok->line_start;
1043
16.8k
        the_current_tok->first_line = tok->lineno;
1044
16.8k
        the_current_tok->start_offset = -1;
1045
16.8k
        the_current_tok->multi_line_start_offset = -1;
1046
16.8k
        the_current_tok->last_expr_buffer = NULL;
1047
16.8k
        the_current_tok->last_expr_size = 0;
1048
16.8k
        the_current_tok->last_expr_end = -1;
1049
16.8k
        the_current_tok->in_format_spec = 0;
1050
16.8k
        the_current_tok->in_debug = 0;
1051
1052
16.8k
        enum string_kind_t string_kind = FSTRING;
1053
16.8k
        switch (*tok->start) {
1054
784
            case 'T':
1055
3.41k
            case 't':
1056
3.41k
                the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r';
1057
3.41k
                string_kind = TSTRING;
1058
3.41k
                break;
1059
2.08k
            case 'F':
1060
12.7k
            case 'f':
1061
12.7k
                the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r';
1062
12.7k
                break;
1063
284
            case 'R':
1064
682
            case 'r':
1065
682
                the_current_tok->raw = 1;
1066
682
                if (Py_TOLOWER(*(tok->start + 1)) == 't') {
1067
209
                    string_kind = TSTRING;
1068
209
                }
1069
682
                break;
1070
0
            default:
1071
0
                Py_UNREACHABLE();
1072
16.8k
        }
1073
1074
16.8k
        the_current_tok->string_kind = string_kind;
1075
16.8k
        the_current_tok->curly_bracket_depth = 0;
1076
16.8k
        the_current_tok->curly_bracket_expr_start_depth = -1;
1077
16.8k
        return string_kind == TSTRING ? MAKE_TOKEN(TSTRING_START) : MAKE_TOKEN(FSTRING_START);
1078
16.8k
    }
1079
1080
904k
  letter_quote:
1081
    /* String */
1082
904k
    if (c == '\'' || c == '"') {
1083
56.5k
        int quote = c;
1084
56.5k
        int quote_size = 1;             /* 1 or 3 */
1085
56.5k
        int end_quote_size = 0;
1086
56.5k
        int has_escaped_quote = 0;
1087
1088
        /* Nodes of type STRING, especially multi-line strings,
1089
           must be handled differently in order to get both
1090
           the starting line number and the column offset right.
1091
           (cf. issue 16806) */
1092
56.5k
        tok->first_lineno = tok->lineno;
1093
56.5k
        tok->multi_line_start = tok->line_start;
1094
1095
        /* Find the quote size and start of string */
1096
56.5k
        c = tok_nextc(tok);
1097
56.5k
        if (c == quote) {
1098
10.0k
            c = tok_nextc(tok);
1099
10.0k
            if (c == quote) {
1100
2.43k
                quote_size = 3;
1101
2.43k
            }
1102
7.60k
            else {
1103
7.60k
                end_quote_size = 1;     /* empty string found */
1104
7.60k
            }
1105
10.0k
        }
1106
56.5k
        if (c != quote) {
1107
54.1k
            tok_backup(tok, c);
1108
54.1k
        }
1109
1110
        /* Get rest of string */
1111
1.13M
        while (end_quote_size != quote_size) {
1112
1.07M
            c = tok_nextc(tok);
1113
1.07M
            if (tok->done == E_ERROR) {
1114
0
                return MAKE_TOKEN(ERRORTOKEN);
1115
0
            }
1116
1.07M
            if (tok->done == E_DECODE) {
1117
0
                break;
1118
0
            }
1119
1.07M
            if (c == EOF || (quote_size == 1 && c == '\n')) {
1120
440
                assert(tok->multi_line_start != NULL);
1121
                // shift the tok_state's location into
1122
                // the start of string, and report the error
1123
                // from the initial quote character
1124
440
                tok->cur = (char *)tok->start;
1125
440
                tok->cur++;
1126
440
                tok->line_start = tok->multi_line_start;
1127
440
                int start = tok->lineno;
1128
440
                tok->lineno = tok->first_lineno;
1129
1130
440
                if (INSIDE_FSTRING(tok)) {
1131
                    /* When we are in an f-string, before raising the
1132
                     * unterminated string literal error, check whether the
1133
                     * initial quote matches the f-string's quotes; if it
1134
                     * does, then this must be a missing '}' token, so raise
1135
                     * the proper error. */
1136
24
                    tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
1137
24
                    if (the_current_tok->quote == quote &&
1138
24
                        the_current_tok->quote_size == quote_size) {
1139
11
                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1140
11
                            "%c-string: expecting '}'", TOK_GET_STRING_PREFIX(tok)));
1141
11
                    }
1142
24
                }
1143
1144
429
                if (quote_size == 3) {
1145
15
                    _PyTokenizer_syntaxerror(tok, "unterminated triple-quoted string literal"
1146
15
                                     " (detected at line %d)", start);
1147
15
                    if (c != '\n') {
1148
15
                        tok->done = E_EOFS;
1149
15
                    }
1150
15
                    return MAKE_TOKEN(ERRORTOKEN);
1151
15
                }
1152
414
                else {
1153
414
                    if (has_escaped_quote) {
1154
12
                        _PyTokenizer_syntaxerror(
1155
12
                            tok,
1156
12
                            "unterminated string literal (detected at line %d); "
1157
12
                            "perhaps you escaped the end quote?",
1158
12
                            start
1159
12
                        );
1160
402
                    } else {
1161
402
                        _PyTokenizer_syntaxerror(
1162
402
                            tok, "unterminated string literal (detected at line %d)", start
1163
402
                        );
1164
402
                    }
1165
414
                    if (c != '\n') {
1166
15
                        tok->done = E_EOLS;
1167
15
                    }
1168
414
                    return MAKE_TOKEN(ERRORTOKEN);
1169
414
                }
1170
429
            }
1171
1.07M
            if (c == quote) {
1172
54.8k
                end_quote_size += 1;
1173
54.8k
            }
1174
1.02M
            else {
1175
1.02M
                end_quote_size = 0;
1176
1.02M
                if (c == '\\') {
1177
32.6k
                    c = tok_nextc(tok);  /* skip escaped char */
1178
32.6k
                    if (c == quote) {  /* but record whether the escaped char was a quote */
1179
1.50k
                        has_escaped_quote = 1;
1180
1.50k
                    }
1181
32.6k
                    if (c == '\r') {
1182
205
                        c = tok_nextc(tok);
1183
205
                    }
1184
32.6k
                }
1185
1.02M
            }
1186
1.07M
        }
1187
1188
56.1k
        p_start = tok->start;
1189
56.1k
        p_end = tok->cur;
1190
56.1k
        return MAKE_TOKEN(STRING);
1191
56.5k
    }
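    /* Illustrative (not in the source): 'abc\' at the end of a line reports
       "unterminated string literal ... perhaps you escaped the end quote?",
       because the escaped quote was recorded in has_escaped_quote above. */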
1192
1193
    /* Line continuation */
1194
848k
    if (c == '\\') {
1195
427
        if ((c = tok_continuation_line(tok)) == -1) {
1196
72
            return MAKE_TOKEN(ERRORTOKEN);
1197
72
        }
1198
355
        tok->cont_line = 1;
1199
355
        goto again; /* Read next line */
1200
427
    }
1201
1202
    /* Punctuation character */
1203
847k
    int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{');
1204
847k
    if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) {
1205
        /* This code block gets executed before curly_bracket_depth is
1206
         * incremented by the `{` case, so to determine whether we are on the
1207
         * 0th level we need to adjust it manually. */
1208
53.6k
        int cursor = current_tok->curly_bracket_depth - (c != '{');
1209
53.6k
        int in_format_spec = current_tok->in_format_spec;
1210
53.6k
        int cursor_in_format_with_debug =
1211
53.6k
            cursor == 1 && (current_tok->in_debug || in_format_spec);
1212
53.6k
        int cursor_valid = cursor == 0 || cursor_in_format_with_debug;
1213
53.6k
        if (cursor_valid && !_PyLexer_update_ftstring_expr(tok, c)) {
1214
0
            return MAKE_TOKEN(ENDMARKER);
1215
0
        }
1216
53.6k
        if (cursor_valid && c != '{' && set_ftstring_expr(tok, token, c)) {
1217
5
            return MAKE_TOKEN(ERRORTOKEN);
1218
5
        }
1219
1220
53.6k
        if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) {
1221
3.80k
            current_tok->kind = TOK_FSTRING_MODE;
1222
3.80k
            current_tok->in_format_spec = 1;
1223
3.80k
            p_start = tok->start;
1224
3.80k
            p_end = tok->cur;
1225
3.80k
            return MAKE_TOKEN(_PyToken_OneChar(c));
1226
3.80k
        }
1227
53.6k
    }
1228
1229
    /* Check for two-character token */
1230
844k
    {
1231
844k
        int c2 = tok_nextc(tok);
1232
844k
        int current_token = _PyToken_TwoChars(c, c2);
1233
844k
        if (current_token != OP) {
1234
22.7k
            int c3 = tok_nextc(tok);
1235
22.7k
            int current_token3 = _PyToken_ThreeChars(c, c2, c3);
1236
22.7k
            if (current_token3 != OP) {
1237
1.21k
                current_token = current_token3;
1238
1.21k
            }
1239
21.5k
            else {
1240
21.5k
                tok_backup(tok, c3);
1241
21.5k
            }
1242
22.7k
            p_start = tok->start;
1243
22.7k
            p_end = tok->cur;
1244
22.7k
            return MAKE_TOKEN(current_token);
1245
22.7k
        }
1246
821k
        tok_backup(tok, c2);
1247
821k
    }
1248
1249
    /* Keep track of parentheses nesting level */
1250
0
    switch (c) {
1251
90.1k
    case '(':
1252
126k
    case '[':
1253
171k
    case '{':
1254
171k
        if (tok->level >= MAXLEVEL) {
1255
3
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested parentheses"));
1256
3
        }
1257
171k
        tok->parenstack[tok->level] = c;
1258
171k
        tok->parenlinenostack[tok->level] = tok->lineno;
1259
171k
        tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
1260
171k
        tok->level++;
1261
171k
        if (INSIDE_FSTRING(tok)) {
1262
30.5k
            current_tok->curly_bracket_depth++;
1263
30.5k
        }
1264
171k
        break;
1265
58.0k
    case ')':
1266
70.0k
    case ']':
1267
96.9k
    case '}':
1268
96.9k
        if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') {
1269
48
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1270
48
                "%c-string: single '}' is not allowed", TOK_GET_STRING_PREFIX(tok)));
1271
48
        }
1272
96.9k
        if (!tok->tok_extra_tokens && !tok->level) {
1273
215
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "unmatched '%c'", c));
1274
215
        }
1275
96.7k
        if (tok->level > 0) {
1276
96.7k
            tok->level--;
1277
96.7k
            int opening = tok->parenstack[tok->level];
1278
96.7k
            if (!tok->tok_extra_tokens && !((opening == '(' && c == ')') ||
1279
96.7k
                                            (opening == '[' && c == ']') ||
1280
96.7k
                                            (opening == '{' && c == '}'))) {
1281
                /* If the opening bracket belongs to an f-string's expression
1282
                part (e.g. f"{)}") and the closing bracket is an arbitrary
1283
                nested expression, then instead of matching it with a
1284
                different syntactical construct, we throw an unmatched
1285
                parentheses error. */
1286
46
                if (INSIDE_FSTRING(tok) && opening == '{') {
1287
7
                    assert(current_tok->curly_bracket_depth >= 0);
1288
7
                    int previous_bracket = current_tok->curly_bracket_depth - 1;
1289
7
                    if (previous_bracket == current_tok->curly_bracket_expr_start_depth) {
1290
5
                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1291
5
                            "%c-string: unmatched '%c'", TOK_GET_STRING_PREFIX(tok), c));
1292
5
                    }
1293
7
                }
1294
41
                if (tok->parenlinenostack[tok->level] != tok->lineno) {
1295
5
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1296
5
                            "closing parenthesis '%c' does not match "
1297
5
                            "opening parenthesis '%c' on line %d",
1298
5
                            c, opening, tok->parenlinenostack[tok->level]));
1299
5
                }
1300
36
                else {
1301
36
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1302
36
                            "closing parenthesis '%c' does not match "
1303
36
                            "opening parenthesis '%c'",
1304
36
                            c, opening));
1305
36
                }
1306
41
            }
1307
96.7k
        }
1308
1309
96.6k
        if (INSIDE_FSTRING(tok)) {
1310
22.8k
            current_tok->curly_bracket_depth--;
1311
22.8k
            if (current_tok->curly_bracket_depth < 0) {
1312
1
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "%c-string: unmatched '%c'",
1313
1
                    TOK_GET_STRING_PREFIX(tok), c));
1314
1
            }
1315
22.8k
            if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
1316
20.7k
                current_tok->curly_bracket_expr_start_depth--;
1317
20.7k
                current_tok->kind = TOK_FSTRING_MODE;
1318
20.7k
                current_tok->in_format_spec = 0;
1319
20.7k
                current_tok->in_debug = 0;
1320
20.7k
            }
1321
22.8k
        }
1322
96.6k
        break;
1323
552k
    default:
1324
552k
        break;
1325
821k
    }
1326
1327
821k
    if (!Py_UNICODE_ISPRINTABLE(c)) {
1328
480
        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", c));
1329
480
    }
1330
1331
820k
    if (c == '=' && INSIDE_FSTRING_EXPR(current_tok)) {
1332
41.8k
        current_tok->in_debug = 1;
1333
41.8k
    }
1334
1335
    /* Punctuation character */
1336
820k
    p_start = tok->start;
1337
820k
    p_end = tok->cur;
1338
820k
    return MAKE_TOKEN(_PyToken_OneChar(c));
1339
821k
}
1340
1341
static int
1342
tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
1343
54.0k
{
1344
54.0k
    const char *p_start = NULL;
1345
54.0k
    const char *p_end = NULL;
1346
54.0k
    int end_quote_size = 0;
1347
54.0k
    int unicode_escape = 0;
1348
1349
54.0k
    tok->start = tok->cur;
1350
54.0k
    tok->first_lineno = tok->lineno;
1351
54.0k
    tok->starting_col_offset = tok->col_offset;
1352
1353
    // If we start with a bracket, we defer to normal mode, as there is nothing for us
1354
    // to tokenize before it.
1355
54.0k
    int start_char = tok_nextc(tok);
1356
54.0k
    if (start_char == '{') {
1357
13.2k
        int peek1 = tok_nextc(tok);
1358
13.2k
        tok_backup(tok, peek1);
1359
13.2k
        tok_backup(tok, start_char);
1360
13.2k
        if (peek1 != '{') {
1361
9.67k
            current_tok->curly_bracket_expr_start_depth++;
1362
9.67k
            if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
1363
4
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1364
4
                    "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok)));
1365
4
            }
1366
9.66k
            TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
1367
9.66k
            return tok_get_normal_mode(tok, current_tok, token);
1368
9.67k
        }
1369
13.2k
    }
1370
40.8k
    else {
1371
40.8k
        tok_backup(tok, start_char);
1372
40.8k
    }
1373
1374
    // Check if we are at the end of the string
1375
63.2k
    for (int i = 0; i < current_tok->quote_size; i++) {
1376
51.0k
        int quote = tok_nextc(tok);
1377
51.0k
        if (quote != current_tok->quote) {
1378
32.1k
            tok_backup(tok, quote);
1379
32.1k
            goto f_string_middle;
1380
32.1k
        }
1381
51.0k
    }
1382
1383
12.2k
    if (current_tok->last_expr_buffer != NULL) {
1384
5.99k
        PyMem_Free(current_tok->last_expr_buffer);
1385
5.99k
        current_tok->last_expr_buffer = NULL;
1386
5.99k
        current_tok->last_expr_size = 0;
1387
5.99k
        current_tok->last_expr_end = -1;
1388
5.99k
    }
1389
1390
12.2k
    p_start = tok->start;
1391
12.2k
    p_end = tok->cur;
1392
12.2k
    tok->tok_mode_stack_index--;
1393
12.2k
    return MAKE_TOKEN(FTSTRING_END(current_tok));
1394
1395
32.1k
f_string_middle:
1396
1397
    // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
1398
    // this.
1399
32.1k
    tok->multi_line_start = tok->line_start;
1400
160k
    while (end_quote_size != current_tok->quote_size) {
1401
154k
        int c = tok_nextc(tok);
1402
154k
        if (tok->done == E_ERROR || tok->done == E_DECODE) {
1403
0
            return MAKE_TOKEN(ERRORTOKEN);
1404
0
        }
1405
154k
        int in_format_spec = (
1406
154k
                current_tok->in_format_spec
1407
154k
                &&
1408
154k
                INSIDE_FSTRING_EXPR(current_tok)
1409
154k
        );
1410
1411
154k
        if (c == EOF || (current_tok->quote_size == 1 && c == '\n')) {
1412
464
            if (tok->decoding_erred) {
1413
0
                return MAKE_TOKEN(ERRORTOKEN);
1414
0
            }
1415
1416
            // If we are in a format spec and we find a newline,
1417
            // it means that the format spec ends here and we should
1418
            // return to regular mode.
1419
464
            if (in_format_spec && c == '\n') {
1420
75
                if (current_tok->quote_size == 1) {
1421
75
                    return MAKE_TOKEN(
1422
75
                        _PyTokenizer_syntaxerror(
1423
75
                            tok,
1424
75
                            "%c-string: newlines are not allowed in format specifiers for single quoted %c-strings",
1425
75
                            TOK_GET_STRING_PREFIX(tok), TOK_GET_STRING_PREFIX(tok)
1426
75
                        )
1427
75
                    );
1428
75
                }
1429
0
                tok_backup(tok, c);
1430
0
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
1431
0
                current_tok->in_format_spec = 0;
1432
0
                p_start = tok->start;
1433
0
                p_end = tok->cur;
1434
0
                return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
1435
75
            }
1436
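A single-quoted f-string whose format specifier runs into a line break lands in the branch above; a sketch (message per the format string, assuming a current CPython of this file's vintage):

    try:
        compile('f"{1:\n}"', "<demo>", "eval")
    except SyntaxError as e:
        print(e.msg)
    # f-string: newlines are not allowed in format specifiers
    # for single quoted f-strings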
1437
389
            assert(tok->multi_line_start != NULL);
1438
            // shift the tok_state's location into
1439
            // the start of the string, and report the error
1440
            // from the initial quote character
1441
389
            tok->cur = (char *)current_tok->start;
1442
389
            tok->cur++;
1443
389
            tok->line_start = current_tok->multi_line_start;
1444
389
            int start = tok->lineno;
1445
1446
389
            tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
1447
389
            tok->lineno = the_current_tok->first_line;
1448
1449
389
            if (current_tok->quote_size == 3) {
1450
32
                _PyTokenizer_syntaxerror(tok,
1451
32
                                    "unterminated triple-quoted %c-string literal"
1452
32
                                    " (detected at line %d)",
1453
32
                                    TOK_GET_STRING_PREFIX(tok), start);
1454
32
                if (c != '\n') {
1455
32
                    tok->done = E_EOFS;
1456
32
                }
1457
32
                return MAKE_TOKEN(ERRORTOKEN);
1458
32
            }
1459
357
            else {
1460
357
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1461
357
                                    "unterminated %c-string literal (detected at"
1462
357
                                    " line %d)", TOK_GET_STRING_PREFIX(tok), start));
1463
357
            }
1464
389
        }
1465
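Both unterminated-literal branches are easy to reproduce; the line reported in the message is where the literal started:

    for src in ('f"abc', 'f"""abc'):
        try:
            compile(src, "<demo>", "eval")
        except SyntaxError as e:
            print(e.msg)
    # unterminated f-string literal (detected at line 1)
    # unterminated triple-quoted f-string literal (detected at line 1)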
1466
154k
        if (c == current_tok->quote) {
1467
8.64k
            end_quote_size += 1;
1468
8.64k
            continue;
1469
145k
        } else {
1470
145k
            end_quote_size = 0;
1471
145k
        }
1472
1473
145k
        if (c == '{') {
1474
20.3k
            if (!_PyLexer_update_ftstring_expr(tok, c)) {
1475
0
                return MAKE_TOKEN(ENDMARKER);
1476
0
            }
1477
20.3k
            int peek = tok_nextc(tok);
1478
20.3k
            if (peek != '{' || in_format_spec) {
1479
16.3k
                tok_backup(tok, peek);
1480
16.3k
                tok_backup(tok, c);
1481
16.3k
                current_tok->curly_bracket_expr_start_depth++;
1482
16.3k
                if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
1483
5
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1484
5
                        "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok)));
1485
5
                }
1486
16.3k
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
1487
16.3k
                current_tok->in_format_spec = 0;
1488
16.3k
                p_start = tok->start;
1489
16.3k
                p_end = tok->cur;
1490
16.3k
            } else {
1491
4.01k
                p_start = tok->start;
1492
4.01k
                p_end = tok->cur - 1;
1493
4.01k
            }
1494
20.3k
            return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
1495
125k
        } else if (c == '}') {
1496
5.02k
            if (unicode_escape) {
1497
496
                p_start = tok->start;
1498
496
                p_end = tok->cur;
1499
496
                return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
1500
496
            }
1501
4.52k
            int peek = tok_nextc(tok);
1502
1503
            // The tokenizer can only be in the format spec if we have already completed the expression
1504
            // scanning (indicated by the end of the expression being set) and we are not at the top level
1505
            // of the bracket stack (-1 is the top level). Since format specifiers can't legally contain
1506
            // doubled brackets, we can bypass the '}}' escape here.
1507
4.52k
            int cursor = current_tok->curly_bracket_depth;
1508
4.52k
            if (peek == '}' && !in_format_spec && cursor == 0) {
1509
1.82k
                p_start = tok->start;
1510
1.82k
                p_end = tok->cur - 1;
1511
2.70k
            } else {
1512
2.70k
                tok_backup(tok, peek);
1513
2.70k
                tok_backup(tok, c);
1514
2.70k
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
1515
2.70k
                current_tok->in_format_spec = 0;
1516
2.70k
                p_start = tok->start;
1517
2.70k
                p_end = tok->cur;
1518
2.70k
            }
1519
4.52k
            return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
1520
120k
        } else if (c == '\\') {
1521
6.84k
            int peek = tok_nextc(tok);
1522
6.84k
            if (peek == '\r') {
1523
69
                peek = tok_nextc(tok);
1524
69
            }
1525
            // Special case when the backslash is right before a curly
1526
            // brace: we have to restore the input and return control
1527
            // to the loop for the next iteration.
1528
6.84k
            if (peek == '{' || peek == '}') {
1529
1.34k
                if (!current_tok->raw) {
1530
1.15k
                    if (_PyTokenizer_warn_invalid_escape_sequence(tok, peek)) {
1531
0
                        return MAKE_TOKEN(ERRORTOKEN);
1532
0
                    }
1533
1.15k
                }
1534
1.34k
                tok_backup(tok, peek);
1535
1.34k
                continue;
1536
1.34k
            }
1537
1538
5.50k
            if (!current_tok->raw) {
1539
5.11k
                if (peek == 'N') {
1540
                    /* Handle named unicode escapes (\N{BULLET}) */
1541
740
                    peek = tok_nextc(tok);
1542
740
                    if (peek == '{') {
1543
523
                        unicode_escape = 1;
1544
523
                    } else {
1545
217
                        tok_backup(tok, peek);
1546
217
                    }
1547
740
                }
1548
5.11k
            } /* else {
1549
                skip the escaped character
1550
            }*/
1551
5.50k
        }
1552
145k
    }
1553
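Taken together, the branches in this loop give doubled braces their literal meaning while keeping \N{...} escapes and nested format-spec fields intact; a quick sketch of the user-visible behavior:

    print(f"{{x}}")             # {x}     -- '{{' and '}}' are literal braces
    print(f"{1:{5}d}")          #     1   -- '{' inside a format spec opens a nested field
    print(f"\N{BULLET} item")   # • item  -- \N{...} braces are a name escape, not an expression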
1554
    // Back up the f-string quotes to emit a final FSTRING_MIDDLE and
1555
    // add the quotes to the FSTRING_END in the next tokenizer iteration.
1556
13.5k
    for (int i = 0; i < current_tok->quote_size; i++) {
1557
7.22k
        tok_backup(tok, current_tok->quote);
1558
7.22k
    }
1559
6.35k
    p_start = tok->start;
1560
6.35k
    p_end = tok->cur;
1561
6.35k
    return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
1562
32.1k
}
1563
1564
static int
1565
tok_get(struct tok_state *tok, struct token *token)
1566
1.77M
{
1567
1.77M
    tokenizer_mode *current_tok = TOK_GET_MODE(tok);
1568
1.77M
    if (current_tok->kind == TOK_REGULAR_MODE) {
1569
1.72M
        return tok_get_normal_mode(tok, current_tok, token);
1570
1.72M
    } else {
1571
54.0k
        return tok_get_fstring_mode(tok, current_tok, token);
1572
54.0k
    }
1573
1.77M
}
1574
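Because tok_get re-reads the top of the mode stack on every call, nested f-strings (PEP 701, CPython 3.12+) work by simply pushing another tokenizer_mode; a sketch of the visible effect:

    import io, tokenize

    src = 'f"{ f"{1}" }"\n'   # same quote character reused inside -- legal since 3.12
    for tok in tokenize.generate_tokens(io.StringIO(src).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))
    # A second FSTRING_START appears before the first FSTRING_END: each
    # nested f-string pushes a new mode onto tok_mode_stack.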
1575
int
1576
_PyTokenizer_Get(struct tok_state *tok, struct token *token)
1577
1.77M
{
1578
1.77M
    int result = tok_get(tok, token);
1579
1.77M
    if (tok->decoding_erred) {
1580
0
        result = ERRORTOKEN;
1581
0
        tok->done = E_DECODE;
1582
0
    }
1583
1.77M
    return result;
1584
1.77M
}