Coverage Report

Created: 2025-07-11 06:24

/src/cpython/Parser/lexer/lexer.c
Line
Count
Source
1
#include "Python.h"
2
#include "pycore_token.h"
3
#include "pycore_unicodeobject.h"
4
#include "errcode.h"
5
6
#include "state.h"
7
#include "../tokenizer/helpers.h"
8
9
/* Alternate tab spacing */
10
8.10k
#define ALTTABSIZE 1
11
12
1.73M
#define is_potential_identifier_start(c) (\
13
1.73M
              (c >= 'a' && c <= 'z')\
14
1.73M
               || (c >= 'A' && c <= 'Z')\
15
1.73M
               || c == '_'\
16
1.73M
               || (c >= 128))
17
18
2.29M
#define is_potential_identifier_char(c) (\
19
2.29M
              (c >= 'a' && c <= 'z')\
20
2.29M
               || (c >= 'A' && c <= 'Z')\
21
2.29M
               || (c >= '0' && c <= '9')\
22
2.29M
               || c == '_'\
23
2.29M
               || (c >= 128))
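
The two macros above classify raw bytes only: any byte >= 128 is provisionally accepted as an identifier character, and the actual PEP 3131 validation happens later in verify_identifier(). A minimal standalone sketch (not part of lexer.c) of that classification:

#include <stdio.h>

/* Same shape as is_potential_identifier_start() above. */
#define ident_start(c) ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') \
                        || c == '_' || (c >= 128))

int main(void)
{
    /* 0xC3 is the first byte of UTF-8 'é': provisionally accepted here and
       only validated as part of an identifier later. */
    int probes[] = { 'a', 'Z', '_', '7', 0xC3 };
    for (int i = 0; i < 5; i++) {
        printf("0x%02X -> %d\n", probes[i], ident_start(probes[i]) ? 1 : 0);
    }
    return 0;
}
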
24
25
#ifdef Py_DEBUG
26
static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) {
27
    assert(tok->tok_mode_stack_index >= 0);
28
    assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL);
29
    return &(tok->tok_mode_stack[tok->tok_mode_stack_index]);
30
}
31
static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) {
32
    assert(tok->tok_mode_stack_index >= 0);
33
    assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL);
34
    return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]);
35
}
36
#else
37
1.86M
#define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index]))
38
16.9k
#define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index]))
39
#endif
40
41
#define FTSTRING_MIDDLE(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_MIDDLE : FSTRING_MIDDLE)
42
#define FTSTRING_END(tok_mode) (tok_mode->string_kind == TSTRING ? TSTRING_END : FSTRING_END)
43
31
#define TOK_GET_STRING_PREFIX(tok) (TOK_GET_MODE(tok)->string_kind == TSTRING ? 't' : 'f')
44
1.74M
#define MAKE_TOKEN(token_type) _PyLexer_token_setup(tok, token, token_type, p_start, p_end)
45
0
#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
46
0
                _PyLexer_type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))
47
48
/* Spaces in this constant are treated as "zero or more spaces or tabs" when
49
   tokenizing. */
50
static const char* type_comment_prefix = "# type: ";
51
52
static inline int
53
contains_null_bytes(const char* str, size_t size)
54
219k
{
55
219k
    return memchr(str, 0, size) != NULL;
56
219k
}
57
58
/* Get next char, updating state; error code goes into tok->done */
59
static int
60
tok_nextc(struct tok_state *tok)
61
10.5M
{
62
10.5M
    int rc;
63
10.7M
    for (;;) {
64
10.7M
        if (tok->cur != tok->inp) {
65
10.4M
            if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
66
0
                tok->done = E_COLUMNOVERFLOW;
67
0
                return EOF;
68
0
            }
69
10.4M
            tok->col_offset++;
70
10.4M
            return Py_CHARMASK(*tok->cur++); /* Fast path */
71
10.4M
        }
72
271k
        if (tok->done != E_OK) {
73
34.6k
            return EOF;
74
34.6k
        }
75
237k
        rc = tok->underflow(tok);
76
#if defined(Py_DEBUG)
77
        if (tok->debug) {
78
            fprintf(stderr, "line[%d] = ", tok->lineno);
79
            _PyTokenizer_print_escape(stderr, tok->cur, tok->inp - tok->cur);
80
            fprintf(stderr, "  tok->done = %d\n", tok->done);
81
        }
82
#endif
83
237k
        if (!rc) {
84
17.4k
            tok->cur = tok->inp;
85
17.4k
            return EOF;
86
17.4k
        }
87
219k
        tok->line_start = tok->cur;
88
89
219k
        if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) {
90
0
            _PyTokenizer_syntaxerror(tok, "source code cannot contain null bytes");
91
0
            tok->cur = tok->inp;
92
0
            return EOF;
93
0
        }
94
219k
    }
95
10.5M
    Py_UNREACHABLE();
96
10.5M
}
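
tok_nextc() is the lexer's hot path: while buffered data remains it costs one pointer comparison and a post-increment, and the refill, error, and NUL-byte checks stay out of the common case. A reduced sketch of that shape, with illustrative names (not the CPython API):

#include <stdio.h>

typedef struct {
    const char *cur, *inp;         /* read cursor / end of buffered data */
    int done;                      /* sticky error code, 0 while OK */
    int (*underflow)(void *self);  /* refills cur..inp; returns 0 at EOF */
} reader;

static int reader_nextc(reader *r)
{
    for (;;) {
        if (r->cur != r->inp) {
            return (unsigned char)*r->cur++;  /* fast path */
        }
        if (r->done != 0 || !r->underflow(r)) {
            return EOF;  /* error already recorded, or no more input */
        }
    }
}
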
97
98
/* Back-up one character */
99
static void
100
tok_backup(struct tok_state *tok, int c)
101
3.68M
{
102
3.68M
    if (c != EOF) {
103
3.64M
        if (--tok->cur < tok->buf) {
104
0
            Py_FatalError("tokenizer beginning of buffer");
105
0
        }
106
3.64M
        if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) {
107
0
            Py_FatalError("tok_backup: wrong character");
108
0
        }
109
3.64M
        tok->col_offset--;
110
3.64M
    }
111
3.68M
}
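
tok_backup() can only un-read what was actually just consumed: it calls Py_FatalError() if the cursor would move before the buffer, or if the byte at the new position differs from the character being pushed back. That invariant is why every speculative tok_nextc() below is paired with a tok_backup():

/* Hypothetical fragment: peek at the next character without consuming it. */
int c = tok_nextc(tok);
tok_backup(tok, c);   /* legal: c is exactly the byte just consumed */
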
112
113
static int
114
21.8k
set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
115
21.8k
    assert(token != NULL);
116
21.8k
    assert(c == '}' || c == ':' || c == '!');
117
21.8k
    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);
118
119
21.8k
    if (!(tok_mode->in_debug || tok_mode->string_kind == TSTRING) || token->metadata) {
120
13.0k
        return 0;
121
13.0k
    }
122
8.80k
    PyObject *res = NULL;
123
124
    // Check if there is a # character in the expression
125
8.80k
    int hash_detected = 0;
126
1.56M
    for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) {
127
1.55M
        if (tok_mode->last_expr_buffer[i] == '#') {
128
1.07k
            hash_detected = 1;
129
1.07k
            break;
130
1.07k
        }
131
1.55M
    }
132
133
8.80k
    if (hash_detected) {
134
1.07k
        Py_ssize_t input_length = tok_mode->last_expr_size - tok_mode->last_expr_end;
135
1.07k
        char *result = (char *)PyMem_Malloc((input_length + 1) * sizeof(char));
136
1.07k
        if (!result) {
137
0
            return -1;
138
0
        }
139
140
1.07k
        Py_ssize_t i = 0;
141
1.07k
        Py_ssize_t j = 0;
142
143
41.7k
        for (i = 0, j = 0; i < input_length; i++) {
144
40.6k
            if (tok_mode->last_expr_buffer[i] == '#') {
145
                // Skip characters until newline or end of string
146
19.8k
                while (i < input_length && tok_mode->last_expr_buffer[i] != '\0') {
147
18.9k
                    if (tok_mode->last_expr_buffer[i] == '\n') {
148
287
                        result[j++] = tok_mode->last_expr_buffer[i];
149
287
                        break;
150
287
                    }
151
18.6k
                    i++;
152
18.6k
                }
153
39.3k
            } else {
154
39.3k
                result[j++] = tok_mode->last_expr_buffer[i];
155
39.3k
            }
156
40.6k
        }
157
158
1.07k
        result[j] = '\0';  // Null-terminate the result string
159
1.07k
        res = PyUnicode_DecodeUTF8(result, j, NULL);
160
1.07k
        PyMem_Free(result);
161
7.72k
    } else {
162
7.72k
        res = PyUnicode_DecodeUTF8(
163
7.72k
            tok_mode->last_expr_buffer,
164
7.72k
            tok_mode->last_expr_size - tok_mode->last_expr_end,
165
7.72k
            NULL
166
7.72k
        );
167
168
7.72k
    }
169
170
171
8.80k
    if (!res) {
172
6
        return -1;
173
6
    }
174
8.79k
    token->metadata = res;
175
8.79k
    return 0;
176
8.80k
}
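
set_ftstring_expr() stores the source text of the last f-/t-string expression as token metadata; when a '#' is present, the loop above copies the expression while dropping each comment up to (but keeping) its terminating newline. A standalone sketch of just that stripping step (hypothetical helper, not a CPython API):

#include <stdio.h>
#include <string.h>

/* dst must hold len + 1 bytes; mirrors the hash_detected branch above. */
static void strip_comments(const char *src, size_t len, char *dst)
{
    size_t j = 0;
    for (size_t i = 0; i < len; i++) {
        if (src[i] == '#') {
            while (i < len && src[i] != '\n') {
                i++;  /* skip the comment body */
            }
            if (i < len) {
                dst[j++] = '\n';  /* keep the newline itself */
            }
        }
        else {
            dst[j++] = src[i];
        }
    }
    dst[j] = '\0';
}

int main(void)
{
    const char *expr = "x +  # comment\n y";
    char out[64];
    strip_comments(expr, strlen(expr), out);
    printf("%s\n", out);  /* "x +  " then " y" on the next line */
    return 0;
}
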
177
178
int
179
_PyLexer_update_ftstring_expr(struct tok_state *tok, char cur)
180
64.5k
{
181
64.5k
    assert(tok->cur != NULL);
182
183
64.5k
    Py_ssize_t size = strlen(tok->cur);
184
64.5k
    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);
185
186
64.5k
    switch (cur) {
187
0
        case 0:
188
0
            if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) {
189
0
                return 1;
190
0
            }
191
0
            char *new_buffer = PyMem_Realloc(
192
0
                tok_mode->last_expr_buffer,
193
0
                tok_mode->last_expr_size + size
194
0
            );
195
0
            if (new_buffer == NULL) {
196
0
                PyMem_Free(tok_mode->last_expr_buffer);
197
0
                goto error;
198
0
            }
199
0
            tok_mode->last_expr_buffer = new_buffer;
200
0
            strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size);
201
0
            tok_mode->last_expr_size += size;
202
0
            break;
203
42.6k
        case '{':
204
42.6k
            if (tok_mode->last_expr_buffer != NULL) {
205
31.8k
                PyMem_Free(tok_mode->last_expr_buffer);
206
31.8k
            }
207
42.6k
            tok_mode->last_expr_buffer = PyMem_Malloc(size);
208
42.6k
            if (tok_mode->last_expr_buffer == NULL) {
209
0
                goto error;
210
0
            }
211
42.6k
            tok_mode->last_expr_size = size;
212
42.6k
            tok_mode->last_expr_end = -1;
213
42.6k
            strncpy(tok_mode->last_expr_buffer, tok->cur, size);
214
42.6k
            break;
215
17.6k
        case '}':
216
19.1k
        case '!':
217
19.1k
            tok_mode->last_expr_end = strlen(tok->start);
218
19.1k
            break;
219
2.77k
        case ':':
220
2.77k
            if (tok_mode->last_expr_end == -1) {
221
2.54k
                tok_mode->last_expr_end = strlen(tok->start);
222
2.54k
            }
223
2.77k
            break;
224
0
        default:
225
0
            Py_UNREACHABLE();
226
64.5k
    }
227
64.5k
    return 1;
228
0
error:
229
0
    tok->done = E_NOMEM;
230
0
    return 0;
231
64.5k
}
232
233
static int
234
lookahead(struct tok_state *tok, const char *test)
235
9.23k
{
236
9.23k
    const char *s = test;
237
9.23k
    int res = 0;
238
24.3k
    while (1) {
239
24.3k
        int c = tok_nextc(tok);
240
24.3k
        if (*s == 0) {
241
9.13k
            res = !is_potential_identifier_char(c);
242
9.13k
        }
243
15.2k
        else if (c == *s) {
244
15.1k
            s++;
245
15.1k
            continue;
246
15.1k
        }
247
248
9.23k
        tok_backup(tok, c);
249
24.3k
        while (s != test) {
250
15.1k
            tok_backup(tok, *--s);
251
15.1k
        }
252
9.23k
        return res;
253
24.3k
    }
254
9.23k
}
255
256
static int
257
97.6k
verify_end_of_number(struct tok_state *tok, int c, const char *kind) {
258
97.6k
    if (tok->tok_extra_tokens) {
259
        // When we are parsing extra tokens, we don't want to emit warnings
260
        // about invalid literals, because we want to be a bit more liberal.
261
0
        return 1;
262
0
    }
263
    /* Emit a deprecation warning only if the numeric literal is immediately
264
     * followed by one of the keywords that can occur after a numeric literal
265
     * in valid code: "and", "else", "for", "if", "in", "is", "not" and "or".
266
     * This allows gradually deprecating existing valid code without emitting
267
     * a warning before the error in most cases of invalid numeric literals
268
     * (which would be confusing and break existing tests).
269
     * Raise a syntax error with a slightly better message than plain
270
     * "invalid syntax" if the numeric literal is immediately followed by
271
     * another keyword or identifier.
272
     */
273
97.6k
    int r = 0;
274
97.6k
    if (c == 'a') {
275
832
        r = lookahead(tok, "nd");
276
832
    }
277
96.8k
    else if (c == 'e') {
278
547
        r = lookahead(tok, "lse");
279
547
    }
280
96.3k
    else if (c == 'f') {
281
3.68k
        r = lookahead(tok, "or");
282
3.68k
    }
283
92.6k
    else if (c == 'i') {
284
2.12k
        int c2 = tok_nextc(tok);
285
2.12k
        if (c2 == 'f' || c2 == 'n' || c2 == 's') {
286
2.11k
            r = 1;
287
2.11k
        }
288
2.12k
        tok_backup(tok, c2);
289
2.12k
    }
290
90.5k
    else if (c == 'o') {
291
3.65k
        r = lookahead(tok, "r");
292
3.65k
    }
293
86.8k
    else if (c == 'n') {
294
519
        r = lookahead(tok, "ot");
295
519
    }
296
97.6k
    if (r) {
297
11.2k
        tok_backup(tok, c);
298
11.2k
        if (_PyTokenizer_parser_warn(tok, PyExc_SyntaxWarning,
299
11.2k
                "invalid %s literal", kind))
300
0
        {
301
0
            return 0;
302
0
        }
303
11.2k
        tok_nextc(tok);
304
11.2k
    }
305
86.4k
    else /* In future releases, only error will remain. */
306
86.4k
    if (c < 128 && is_potential_identifier_char(c)) {
307
197
        tok_backup(tok, c);
308
197
        _PyTokenizer_syntaxerror(tok, "invalid %s literal", kind);
309
197
        return 0;
310
197
    }
311
97.4k
    return 1;
312
97.6k
}
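
A few worked inputs for verify_end_of_number(), as implied by the branches above:

/*   "1if x else 2"  -> SyntaxWarning "invalid decimal literal"
 *                      (the literal is immediately followed by keyword "if")
 *   "1abc"          -> SyntaxError "invalid decimal literal"
 *                      (followed by an identifier that is not a keyword)
 *   "1 if x else 2" -> accepted; the literal ends at the space  */
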
313
314
/* Verify that the identifier follows PEP 3131. */
315
static int
316
verify_identifier(struct tok_state *tok)
317
13.2k
{
318
13.2k
    if (tok->tok_extra_tokens) {
319
0
        return 1;
320
0
    }
321
13.2k
    PyObject *s;
322
13.2k
    if (tok->decoding_erred)
323
0
        return 0;
324
13.2k
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
325
13.2k
    if (s == NULL) {
326
983
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
327
983
            tok->done = E_DECODE;
328
983
        }
329
0
        else {
330
0
            tok->done = E_ERROR;
331
0
        }
332
983
        return 0;
333
983
    }
334
12.2k
    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
335
12.2k
    assert(invalid >= 0);
336
12.2k
    assert(PyUnicode_GET_LENGTH(s) > 0);
337
12.2k
    if (invalid < PyUnicode_GET_LENGTH(s)) {
338
606
        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
339
606
        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
340
            /* Determine the offset in UTF-8 encoded input */
341
417
            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
342
417
            if (s != NULL) {
343
417
                Py_SETREF(s, PyUnicode_AsUTF8String(s));
344
417
            }
345
417
            if (s == NULL) {
346
0
                tok->done = E_ERROR;
347
0
                return 0;
348
0
            }
349
417
            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
350
417
        }
351
606
        Py_DECREF(s);
352
606
        if (Py_UNICODE_ISPRINTABLE(ch)) {
353
347
            _PyTokenizer_syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch);
354
347
        }
355
259
        else {
356
259
            _PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", ch);
357
259
        }
358
606
        return 0;
359
606
    }
360
11.6k
    Py_DECREF(s);
361
11.6k
    return 1;
362
12.2k
}
363
364
static int
365
tok_decimal_tail(struct tok_state *tok)
366
79.5k
{
367
79.5k
    int c;
368
369
79.9k
    while (1) {
370
219k
        do {
371
219k
            c = tok_nextc(tok);
372
219k
        } while (Py_ISDIGIT(c));
373
79.9k
        if (c != '_') {
374
79.5k
            break;
375
79.5k
        }
376
467
        c = tok_nextc(tok);
377
467
        if (!Py_ISDIGIT(c)) {
378
16
            tok_backup(tok, c);
379
16
            _PyTokenizer_syntaxerror(tok, "invalid decimal literal");
380
16
            return 0;
381
16
        }
382
467
    }
383
79.5k
    return c;
384
79.5k
}
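
tok_decimal_tail() also enforces PEP 515 digit grouping for the run it consumes; some illustrative inputs:

/*   "1_000_000"  -> consumed in full
 *   "1__0", "1_" -> an underscore not followed by a digit raises
 *                   "invalid decimal literal"  */
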
385
386
static inline int
387
1.20k
tok_continuation_line(struct tok_state *tok) {
388
1.20k
    int c = tok_nextc(tok);
389
1.20k
    if (c == '\r') {
390
68
        c = tok_nextc(tok);
391
68
    }
392
1.20k
    if (c != '\n') {
393
61
        tok->done = E_LINECONT;
394
61
        return -1;
395
61
    }
396
1.14k
    c = tok_nextc(tok);
397
1.14k
    if (c == EOF) {
398
37
        tok->done = E_EOF;
399
37
        tok->cur = tok->inp;
400
37
        return -1;
401
1.10k
    } else {
402
1.10k
        tok_backup(tok, c);
403
1.10k
    }
404
1.10k
    return c;
405
1.14k
}
406
407
static int
408
maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,
409
                                             int saw_b, int saw_r, int saw_u,
410
22.1k
                                             int saw_f, int saw_t) {
411
    // Supported: rb, rf, rt (in any order)
412
    // Unsupported: ub, ur, uf, ut, bf, bt, ft (in any order)
413
414
22.1k
#define RETURN_SYNTAX_ERROR(PREFIX1, PREFIX2)                             \
415
22.1k
    do {                                                                  \
416
8
        (void)_PyTokenizer_syntaxerror_known_range(                       \
417
8
            tok, (int)(tok->start + 1 - tok->line_start),                 \
418
8
            (int)(tok->cur - tok->line_start),                            \
419
8
            "'" PREFIX1 "' and '" PREFIX2 "' prefixes are incompatible"); \
420
8
        return -1;                                                        \
421
8
    } while (0)
422
423
22.1k
    if (saw_u && saw_b) {
424
1
        RETURN_SYNTAX_ERROR("u", "b");
425
1
    }
426
22.1k
    if (saw_u && saw_r) {
427
1
        RETURN_SYNTAX_ERROR("u", "r");
428
1
    }
429
22.1k
    if (saw_u && saw_f) {
430
1
        RETURN_SYNTAX_ERROR("u", "f");
431
1
    }
432
22.1k
    if (saw_u && saw_t) {
433
1
        RETURN_SYNTAX_ERROR("u", "t");
434
1
    }
435
436
22.1k
    if (saw_b && saw_f) {
437
2
        RETURN_SYNTAX_ERROR("b", "f");
438
2
    }
439
22.1k
    if (saw_b && saw_t) {
440
1
        RETURN_SYNTAX_ERROR("b", "t");
441
1
    }
442
443
22.1k
    if (saw_f && saw_t) {
444
1
        RETURN_SYNTAX_ERROR("f", "t");
445
1
    }
446
447
22.1k
#undef RETURN_SYNTAX_ERROR
448
449
22.1k
    return 0;
450
22.1k
}
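
To summarize the checks above: 'r' combines with 'b', 'f' or 't' in either order and any case, while every pairing involving 'u', plus the 'bf', 'bt' and 'ft' pairs, is rejected:

/*   rb, bR, fr, Rf, tr, rT    -> accepted
 *   ub, ur, uf, ut, bf, bt, ft -> "'x' and 'y' prefixes are incompatible" */
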
451
452
static int
453
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
454
1.70M
{
455
1.70M
    int c;
456
1.70M
    int blankline, nonascii;
457
458
1.70M
    const char *p_start = NULL;
459
1.70M
    const char *p_end = NULL;
460
1.78M
  nextline:
461
1.78M
    tok->start = NULL;
462
1.78M
    tok->starting_col_offset = -1;
463
1.78M
    blankline = 0;
464
465
466
    /* Get indentation level */
467
1.78M
    if (tok->atbol) {
468
220k
        int col = 0;
469
220k
        int altcol = 0;
470
220k
        tok->atbol = 0;
471
220k
        int cont_line_col = 0;
472
829k
        for (;;) {
473
829k
            c = tok_nextc(tok);
474
829k
            if (c == ' ') {
475
602k
                col++, altcol++;
476
602k
            }
477
227k
            else if (c == '\t') {
478
4.05k
                col = (col / tok->tabsize + 1) * tok->tabsize;
479
4.05k
                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
480
4.05k
            }
481
223k
            else if (c == '\014')  {/* Control-L (formfeed) */
482
2.04k
                col = altcol = 0; /* For Emacs users */
483
2.04k
            }
484
221k
            else if (c == '\\') {
485
                // Indentation cannot be split over multiple physical lines
486
                // using backslashes. This means that if we found a backslash
487
                // preceded by whitespace, **the first one we find** determines
488
                // the level of indentation of whatever comes next.
489
777
                cont_line_col = cont_line_col ? cont_line_col : col;
490
777
                if ((c = tok_continuation_line(tok)) == -1) {
491
28
                    return MAKE_TOKEN(ERRORTOKEN);
492
28
                }
493
777
            }
494
220k
            else {
495
220k
                break;
496
220k
            }
497
829k
        }
498
220k
        tok_backup(tok, c);
499
220k
        if (c == '#' || c == '\n' || c == '\r') {
500
            /* Lines with only whitespace and/or comments
501
               shouldn't affect the indentation and are
502
               not passed to the parser as NEWLINE tokens,
503
               except *totally* empty lines in interactive
504
               mode, which signal the end of a command group. */
505
44.8k
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
506
0
                blankline = 0; /* Let it through */
507
0
            }
508
44.8k
            else if (tok->prompt != NULL && tok->lineno == 1) {
509
                /* In interactive mode, if the first line contains
510
                   only spaces and/or a comment, let it through. */
511
0
                blankline = 0;
512
0
                col = altcol = 0;
513
0
            }
514
44.8k
            else {
515
44.8k
                blankline = 1; /* Ignore completely */
516
44.8k
            }
517
            /* We can't jump back right here since we still
518
               may need to skip to the end of a comment */
519
44.8k
        }
520
220k
        if (!blankline && tok->level == 0) {
521
135k
            col = cont_line_col ? cont_line_col : col;
522
135k
            altcol = cont_line_col ? cont_line_col : altcol;
523
135k
            if (col == tok->indstack[tok->indent]) {
524
                /* No change */
525
101k
                if (altcol != tok->altindstack[tok->indent]) {
526
1
                    return MAKE_TOKEN(_PyTokenizer_indenterror(tok));
527
1
                }
528
101k
            }
529
33.8k
            else if (col > tok->indstack[tok->indent]) {
530
                /* Indent -- always one */
531
19.0k
                if (tok->indent+1 >= MAXINDENT) {
532
0
                    tok->done = E_TOODEEP;
533
0
                    tok->cur = tok->inp;
534
0
                    return MAKE_TOKEN(ERRORTOKEN);
535
0
                }
536
19.0k
                if (altcol <= tok->altindstack[tok->indent]) {
537
3
                    return MAKE_TOKEN(_PyTokenizer_indenterror(tok));
538
3
                }
539
19.0k
                tok->pendin++;
540
19.0k
                tok->indstack[++tok->indent] = col;
541
19.0k
                tok->altindstack[tok->indent] = altcol;
542
19.0k
            }
543
14.8k
            else /* col < tok->indstack[tok->indent] */ {
544
                /* Dedent -- any number, must be consistent */
545
33.0k
                while (tok->indent > 0 &&
546
33.0k
                    col < tok->indstack[tok->indent]) {
547
18.2k
                    tok->pendin--;
548
18.2k
                    tok->indent--;
549
18.2k
                }
550
14.8k
                if (col != tok->indstack[tok->indent]) {
551
11
                    tok->done = E_DEDENT;
552
11
                    tok->cur = tok->inp;
553
11
                    return MAKE_TOKEN(ERRORTOKEN);
554
11
                }
555
14.8k
                if (altcol != tok->altindstack[tok->indent]) {
556
1
                    return MAKE_TOKEN(_PyTokenizer_indenterror(tok));
557
1
                }
558
14.8k
            }
559
135k
        }
560
220k
    }
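
A worked example of the col/altcol bookkeeping above: with tok->tabsize == 8, the indentation " \t" (a space, then a tab) gives col == 8, because the tab rounds 1 up to the next multiple of 8, but altcol == 2, because ALTTABSIZE == 1 makes a tab advance exactly one column. Lines whose indentations compare equal under one metric and unequal under the other are ambiguous, which is what the _PyTokenizer_indenterror() calls below report.
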
561
562
1.78M
    tok->start = tok->cur;
563
1.78M
    tok->starting_col_offset = tok->col_offset;
564
565
    /* Return pending indents/dedents */
566
1.78M
    if (tok->pendin != 0) {
567
37.2k
        if (tok->pendin < 0) {
568
18.2k
            if (tok->tok_extra_tokens) {
569
0
                p_start = tok->cur;
570
0
                p_end = tok->cur;
571
0
            }
572
18.2k
            tok->pendin++;
573
18.2k
            return MAKE_TOKEN(DEDENT);
574
18.2k
        }
575
19.0k
        else {
576
19.0k
            if (tok->tok_extra_tokens) {
577
0
                p_start = tok->buf;
578
0
                p_end = tok->cur;
579
0
            }
580
19.0k
            tok->pendin--;
581
19.0k
            return MAKE_TOKEN(INDENT);
582
19.0k
        }
583
37.2k
    }
584
585
    /* Peek ahead at the next character */
586
1.75M
    c = tok_nextc(tok);
587
1.75M
    tok_backup(tok, c);
588
589
1.75M
 again:
590
1.75M
    tok->start = NULL;
591
    /* Skip spaces */
592
2.09M
    do {
593
2.09M
        c = tok_nextc(tok);
594
2.09M
    } while (c == ' ' || c == '\t' || c == '\014');
595
596
    /* Set start of current token */
597
1.75M
    tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
598
1.75M
    tok->starting_col_offset = tok->col_offset - 1;
599
600
    /* Skip comment, unless it's a type comment */
601
1.75M
    if (c == '#') {
602
603
42.0k
        const char* p = NULL;
604
42.0k
        const char *prefix, *type_start;
605
42.0k
        int current_starting_col_offset;
606
607
1.24M
        while (c != EOF && c != '\n' && c != '\r') {
608
1.20M
            c = tok_nextc(tok);
609
1.20M
        }
610
611
42.0k
        if (tok->tok_extra_tokens) {
612
0
            p = tok->start;
613
0
        }
614
615
42.0k
        if (tok->type_comments) {
616
0
            p = tok->start;
617
0
            current_starting_col_offset = tok->starting_col_offset;
618
0
            prefix = type_comment_prefix;
619
0
            while (*prefix && p < tok->cur) {
620
0
                if (*prefix == ' ') {
621
0
                    while (*p == ' ' || *p == '\t') {
622
0
                        p++;
623
0
                        current_starting_col_offset++;
624
0
                    }
625
0
                } else if (*prefix == *p) {
626
0
                    p++;
627
0
                    current_starting_col_offset++;
628
0
                } else {
629
0
                    break;
630
0
                }
631
632
0
                prefix++;
633
0
            }
634
635
            /* This is a type comment if we matched all of type_comment_prefix. */
636
0
            if (!*prefix) {
637
0
                int is_type_ignore = 1;
638
                // +6 in order to skip the word 'ignore'
639
0
                const char *ignore_end = p + 6;
640
0
                const int ignore_end_col_offset = current_starting_col_offset + 6;
641
0
                tok_backup(tok, c);  /* don't eat the newline or EOF */
642
643
0
                type_start = p;
644
645
                /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
646
                 * or anything ASCII and non-alphanumeric. */
647
0
                is_type_ignore = (
648
0
                    tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
649
0
                    && !(tok->cur > ignore_end
650
0
                         && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
651
652
0
                if (is_type_ignore) {
653
0
                    p_start = ignore_end;
654
0
                    p_end = tok->cur;
655
656
                    /* If this type ignore is the only thing on the line, consume the newline also. */
657
0
                    if (blankline) {
658
0
                        tok_nextc(tok);
659
0
                        tok->atbol = 1;
660
0
                    }
661
0
                    return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset);
662
0
                } else {
663
0
                    p_start = type_start;
664
0
                    p_end = tok->cur;
665
0
                    return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset);
666
0
                }
667
0
            }
668
0
        }
669
42.0k
        if (tok->tok_extra_tokens) {
670
0
            tok_backup(tok, c);  /* don't eat the newline or EOF */
671
0
            p_start = p;
672
0
            p_end = tok->cur;
673
0
            tok->comment_newline = blankline;
674
0
            return MAKE_TOKEN(COMMENT);
675
0
        }
676
42.0k
    }
677
678
1.75M
    if (tok->done == E_INTERACT_STOP) {
679
0
        return MAKE_TOKEN(ENDMARKER);
680
0
    }
681
682
    /* Check for EOF and errors now */
683
1.75M
    if (c == EOF) {
684
17.3k
        if (tok->level) {
685
4.29k
            return MAKE_TOKEN(ERRORTOKEN);
686
4.29k
        }
687
13.0k
        return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN);
688
17.3k
    }
689
690
    /* Identifier (most frequent token!) */
691
1.73M
    nonascii = 0;
692
1.73M
    if (is_potential_identifier_start(c)) {
693
        /* Process the various legal combinations of b"", r"", u"", and f"". */
694
523k
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0;
695
645k
        while (1) {
696
645k
            if (!saw_b && (c == 'b' || c == 'B')) {
697
20.7k
                saw_b = 1;
698
20.7k
            }
699
            /* Since this is a backwards-compatibility literal, we don't want
700
               to support it in arbitrary order the way byte literals allow. */
701
624k
            else if (!saw_u && (c == 'u'|| c == 'U')) {
702
6.30k
                saw_u = 1;
703
6.30k
            }
704
            /* ur"" and ru"" are not supported */
705
618k
            else if (!saw_r && (c == 'r' || c == 'R')) {
706
36.7k
                saw_r = 1;
707
36.7k
            }
708
581k
            else if (!saw_f && (c == 'f' || c == 'F')) {
709
49.1k
                saw_f = 1;
710
49.1k
            }
711
532k
            else if (!saw_t && (c == 't' || c == 'T')) {
712
30.9k
                saw_t = 1;
713
30.9k
            }
714
501k
            else {
715
501k
                break;
716
501k
            }
717
143k
            c = tok_nextc(tok);
718
143k
            if (c == '"' || c == '\'') {
719
                // Raise error on incompatible string prefixes:
720
22.1k
                int status = maybe_raise_syntax_error_for_string_prefixes(
721
22.1k
                    tok, saw_b, saw_r, saw_u, saw_f, saw_t);
722
22.1k
                if (status < 0) {
723
8
                    return MAKE_TOKEN(ERRORTOKEN);
724
8
                }
725
726
                // Handle valid f or t string creation:
727
22.1k
                if (saw_f || saw_t) {
728
16.9k
                    goto f_string_quote;
729
16.9k
                }
730
5.20k
                goto letter_quote;
731
22.1k
            }
732
143k
        }
733
2.19M
        while (is_potential_identifier_char(c)) {
734
1.69M
            if (c >= 128) {
735
174k
                nonascii = 1;
736
174k
            }
737
1.69M
            c = tok_nextc(tok);
738
1.69M
        }
739
501k
        tok_backup(tok, c);
740
501k
        if (nonascii && !verify_identifier(tok)) {
741
1.58k
            return MAKE_TOKEN(ERRORTOKEN);
742
1.58k
        }
743
744
499k
        p_start = tok->start;
745
499k
        p_end = tok->cur;
746
747
499k
        return MAKE_TOKEN(NAME);
748
501k
    }
749
750
1.21M
    if (c == '\r') {
751
429
        c = tok_nextc(tok);
752
429
    }
753
754
    /* Newline */
755
1.21M
    if (c == '\n') {
756
199k
        tok->atbol = 1;
757
199k
        if (blankline || tok->level > 0) {
758
84.6k
            if (tok->tok_extra_tokens) {
759
0
                if (tok->comment_newline) {
760
0
                    tok->comment_newline = 0;
761
0
                }
762
0
                p_start = tok->start;
763
0
                p_end = tok->cur;
764
0
                return MAKE_TOKEN(NL);
765
0
            }
766
84.6k
            goto nextline;
767
84.6k
        }
768
114k
        if (tok->comment_newline && tok->tok_extra_tokens) {
769
0
            tok->comment_newline = 0;
770
0
            p_start = tok->start;
771
0
            p_end = tok->cur;
772
0
            return MAKE_TOKEN(NL);
773
0
        }
774
114k
        p_start = tok->start;
775
114k
        p_end = tok->cur - 1; /* Leave '\n' out of the string */
776
114k
        tok->cont_line = 0;
777
114k
        return MAKE_TOKEN(NEWLINE);
778
114k
    }
779
780
    /* Period or number starting with period? */
781
1.01M
    if (c == '.') {
782
32.1k
        c = tok_nextc(tok);
783
32.1k
        if (Py_ISDIGIT(c)) {
784
3.22k
            goto fraction;
785
28.9k
        } else if (c == '.') {
786
3.49k
            c = tok_nextc(tok);
787
3.49k
            if (c == '.') {
788
2.85k
                p_start = tok->start;
789
2.85k
                p_end = tok->cur;
790
2.85k
                return MAKE_TOKEN(ELLIPSIS);
791
2.85k
            }
792
632
            else {
793
632
                tok_backup(tok, c);
794
632
            }
795
632
            tok_backup(tok, '.');
796
632
        }
797
25.4k
        else {
798
25.4k
            tok_backup(tok, c);
799
25.4k
        }
800
26.0k
        p_start = tok->start;
801
26.0k
        p_end = tok->cur;
802
26.0k
        return MAKE_TOKEN(DOT);
803
32.1k
    }
804
805
    /* Number */
806
979k
    if (Py_ISDIGIT(c)) {
807
94.5k
        if (c == '0') {
808
            /* Hex, octal or binary -- maybe. */
809
32.0k
            c = tok_nextc(tok);
810
32.0k
            if (c == 'x' || c == 'X') {
811
                /* Hex */
812
16.0k
                c = tok_nextc(tok);
813
16.2k
                do {
814
16.2k
                    if (c == '_') {
815
223
                        c = tok_nextc(tok);
816
223
                    }
817
16.2k
                    if (!Py_ISXDIGIT(c)) {
818
18
                        tok_backup(tok, c);
819
18
                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid hexadecimal literal"));
820
18
                    }
821
80.1k
                    do {
822
80.1k
                        c = tok_nextc(tok);
823
80.1k
                    } while (Py_ISXDIGIT(c));
824
16.2k
                } while (c == '_');
825
16.0k
                if (!verify_end_of_number(tok, c, "hexadecimal")) {
826
1
                    return MAKE_TOKEN(ERRORTOKEN);
827
1
                }
828
16.0k
            }
829
16.0k
            else if (c == 'o' || c == 'O') {
830
                /* Octal */
831
712
                c = tok_nextc(tok);
832
1.35k
                do {
833
1.35k
                    if (c == '_') {
834
647
                        c = tok_nextc(tok);
835
647
                    }
836
1.35k
                    if (c < '0' || c >= '8') {
837
25
                        if (Py_ISDIGIT(c)) {
838
1
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
839
1
                                    "invalid digit '%c' in octal literal", c));
840
1
                        }
841
24
                        else {
842
24
                            tok_backup(tok, c);
843
24
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid octal literal"));
844
24
                        }
845
25
                    }
846
4.39k
                    do {
847
4.39k
                        c = tok_nextc(tok);
848
4.39k
                    } while ('0' <= c && c < '8');
849
1.32k
                } while (c == '_');
850
687
                if (Py_ISDIGIT(c)) {
851
2
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
852
2
                            "invalid digit '%c' in octal literal", c));
853
2
                }
854
685
                if (!verify_end_of_number(tok, c, "octal")) {
855
2
                    return MAKE_TOKEN(ERRORTOKEN);
856
2
                }
857
685
            }
858
15.3k
            else if (c == 'b' || c == 'B') {
859
                /* Binary */
860
584
                c = tok_nextc(tok);
861
896
                do {
862
896
                    if (c == '_') {
863
320
                        c = tok_nextc(tok);
864
320
                    }
865
896
                    if (c != '0' && c != '1') {
866
29
                        if (Py_ISDIGIT(c)) {
867
1
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c));
868
1
                        }
869
28
                        else {
870
28
                            tok_backup(tok, c);
871
28
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid binary literal"));
872
28
                        }
873
29
                    }
874
4.01k
                    do {
875
4.01k
                        c = tok_nextc(tok);
876
4.01k
                    } while (c == '0' || c == '1');
877
867
                } while (c == '_');
878
555
                if (Py_ISDIGIT(c)) {
879
1
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c));
880
1
                }
881
554
                if (!verify_end_of_number(tok, c, "binary")) {
882
1
                    return MAKE_TOKEN(ERRORTOKEN);
883
1
                }
884
554
            }
885
14.7k
            else {
886
14.7k
                int nonzero = 0;
887
                /* maybe old-style octal; c is first char of it */
888
                /* in any case, allow '0' as a literal */
889
16.5k
                while (1) {
890
16.5k
                    if (c == '_') {
891
224
                        c = tok_nextc(tok);
892
224
                        if (!Py_ISDIGIT(c)) {
893
3
                            tok_backup(tok, c);
894
3
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal"));
895
3
                        }
896
224
                    }
897
16.5k
                    if (c != '0') {
898
14.7k
                        break;
899
14.7k
                    }
900
1.77k
                    c = tok_nextc(tok);
901
1.77k
                }
902
14.7k
                char* zeros_end = tok->cur;
903
14.7k
                if (Py_ISDIGIT(c)) {
904
581
                    nonzero = 1;
905
581
                    c = tok_decimal_tail(tok);
906
581
                    if (c == 0) {
907
2
                        return MAKE_TOKEN(ERRORTOKEN);
908
2
                    }
909
581
                }
910
14.7k
                if (c == '.') {
911
806
                    c = tok_nextc(tok);
912
806
                    goto fraction;
913
806
                }
914
13.9k
                else if (c == 'e' || c == 'E') {
915
1.07k
                    goto exponent;
916
1.07k
                }
917
12.8k
                else if (c == 'j' || c == 'J') {
918
873
                    goto imaginary;
919
873
                }
920
11.9k
                else if (nonzero && !tok->tok_extra_tokens) {
921
                    /* Old-style octal: now disallowed. */
922
21
                    tok_backup(tok, c);
923
21
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror_known_range(
924
21
                            tok, (int)(tok->start + 1 - tok->line_start),
925
21
                            (int)(zeros_end - tok->line_start),
926
21
                            "leading zeros in decimal integer "
927
21
                            "literals are not permitted; "
928
21
                            "use an 0o prefix for octal integers"));
929
21
                }
930
11.9k
                if (!verify_end_of_number(tok, c, "decimal")) {
931
30
                    return MAKE_TOKEN(ERRORTOKEN);
932
30
                }
933
11.9k
            }
934
32.0k
        }
935
62.5k
        else {
936
            /* Decimal */
937
62.5k
            c = tok_decimal_tail(tok);
938
62.5k
            if (c == 0) {
939
11
                return MAKE_TOKEN(ERRORTOKEN);
940
11
            }
941
62.4k
            {
942
                /* Accept floating-point numbers. */
943
62.4k
                if (c == '.') {
944
3.80k
                    c = tok_nextc(tok);
945
7.82k
        fraction:
946
                    /* Fraction */
947
7.82k
                    if (Py_ISDIGIT(c)) {
948
6.05k
                        c = tok_decimal_tail(tok);
949
6.05k
                        if (c == 0) {
950
2
                            return MAKE_TOKEN(ERRORTOKEN);
951
2
                        }
952
6.05k
                    }
953
7.82k
                }
954
66.5k
                if (c == 'e' || c == 'E') {
955
9.88k
                    int e;
956
10.9k
                  exponent:
957
10.9k
                    e = c;
958
                    /* Exponent part */
959
10.9k
                    c = tok_nextc(tok);
960
10.9k
                    if (c == '+' || c == '-') {
961
3.55k
                        c = tok_nextc(tok);
962
3.55k
                        if (!Py_ISDIGIT(c)) {
963
9
                            tok_backup(tok, c);
964
9
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal"));
965
9
                        }
966
7.40k
                    } else if (!Py_ISDIGIT(c)) {
967
559
                        tok_backup(tok, c);
968
559
                        if (!verify_end_of_number(tok, e, "decimal")) {
969
47
                            return MAKE_TOKEN(ERRORTOKEN);
970
47
                        }
971
512
                        tok_backup(tok, e);
972
512
                        p_start = tok->start;
973
512
                        p_end = tok->cur;
974
512
                        return MAKE_TOKEN(NUMBER);
975
559
                    }
976
10.3k
                    c = tok_decimal_tail(tok);
977
10.3k
                    if (c == 0) {
978
1
                        return MAKE_TOKEN(ERRORTOKEN);
979
1
                    }
980
10.3k
                }
981
67.0k
                if (c == 'j' || c == 'J') {
982
                    /* Imaginary part */
983
4.53k
        imaginary:
984
4.53k
                    c = tok_nextc(tok);
985
4.53k
                    if (!verify_end_of_number(tok, c, "imaginary")) {
986
10
                        return MAKE_TOKEN(ERRORTOKEN);
987
10
                    }
988
4.53k
                }
989
63.3k
                else if (!verify_end_of_number(tok, c, "decimal")) {
990
106
                    return MAKE_TOKEN(ERRORTOKEN);
991
106
                }
992
67.0k
            }
993
67.0k
        }
994
96.9k
        tok_backup(tok, c);
995
96.9k
        p_start = tok->start;
996
96.9k
        p_end = tok->cur;
997
96.9k
        return MAKE_TOKEN(NUMBER);
998
94.5k
    }
999
1000
901k
  f_string_quote:
1001
901k
    if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't')
1002
901k
        && (c == '\'' || c == '"'))) {
1003
1004
16.9k
        int quote = c;
1005
16.9k
        int quote_size = 1;             /* 1 or 3 */
1006
1007
        /* Nodes of type STRING, especially multi-line strings,
1008
           must be handled differently in order to get both
1009
           the starting line number and the column offset right.
1010
           (cf. issue 16806) */
1011
16.9k
        tok->first_lineno = tok->lineno;
1012
16.9k
        tok->multi_line_start = tok->line_start;
1013
1014
        /* Find the quote size and start of string */
1015
16.9k
        int after_quote = tok_nextc(tok);
1016
16.9k
        if (after_quote == quote) {
1017
2.35k
            int after_after_quote = tok_nextc(tok);
1018
2.35k
            if (after_after_quote == quote) {
1019
789
                quote_size = 3;
1020
789
            }
1021
1.56k
            else {
1022
                // TODO: Check this
1023
1.56k
                tok_backup(tok, after_after_quote);
1024
1.56k
                tok_backup(tok, after_quote);
1025
1.56k
            }
1026
2.35k
        }
1027
16.9k
        if (after_quote != quote) {
1028
14.5k
            tok_backup(tok, after_quote);
1029
14.5k
        }
1030
1031
1032
16.9k
        p_start = tok->start;
1033
16.9k
        p_end = tok->cur;
1034
16.9k
        if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) {
1035
2
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested f-strings or t-strings"));
1036
2
        }
1037
16.9k
        tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok);
1038
16.9k
        the_current_tok->kind = TOK_FSTRING_MODE;
1039
16.9k
        the_current_tok->quote = quote;
1040
16.9k
        the_current_tok->quote_size = quote_size;
1041
16.9k
        the_current_tok->start = tok->start;
1042
16.9k
        the_current_tok->multi_line_start = tok->line_start;
1043
16.9k
        the_current_tok->first_line = tok->lineno;
1044
16.9k
        the_current_tok->start_offset = -1;
1045
16.9k
        the_current_tok->multi_line_start_offset = -1;
1046
16.9k
        the_current_tok->last_expr_buffer = NULL;
1047
16.9k
        the_current_tok->last_expr_size = 0;
1048
16.9k
        the_current_tok->last_expr_end = -1;
1049
16.9k
        the_current_tok->in_format_spec = 0;
1050
16.9k
        the_current_tok->in_debug = 0;
1051
1052
16.9k
        enum string_kind_t string_kind = FSTRING;
1053
16.9k
        switch (*tok->start) {
1054
790
            case 'T':
1055
3.25k
            case 't':
1056
3.25k
                the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r';
1057
3.25k
                string_kind = TSTRING;
1058
3.25k
                break;
1059
2.08k
            case 'F':
1060
13.0k
            case 'f':
1061
13.0k
                the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r';
1062
13.0k
                break;
1063
265
            case 'R':
1064
656
            case 'r':
1065
656
                the_current_tok->raw = 1;
1066
656
                if (Py_TOLOWER(*(tok->start + 1)) == 't') {
1067
202
                    string_kind = TSTRING;
1068
202
                }
1069
656
                break;
1070
0
            default:
1071
0
                Py_UNREACHABLE();
1072
16.9k
        }
1073
1074
16.9k
        the_current_tok->string_kind = string_kind;
1075
16.9k
        the_current_tok->curly_bracket_depth = 0;
1076
16.9k
        the_current_tok->curly_bracket_expr_start_depth = -1;
1077
16.9k
        return string_kind == TSTRING ? MAKE_TOKEN(TSTRING_START) : MAKE_TOKEN(FSTRING_START);
1078
16.9k
    }
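
From this point the lexer is in TOK_FSTRING_MODE: tok_get_fstring_mode() below emits the literal segments and hands embedded expressions back to normal mode. As a sketch (token names as emitted by recent CPython; the exact stream is not part of this report), the input f"a{b}c" yields:

/*   FSTRING_START 'f"'
 *   FSTRING_MIDDLE 'a'
 *   OP '{'   NAME 'b'   OP '}'
 *   FSTRING_MIDDLE 'c'
 *   FSTRING_END '"'  */
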
1079
1080
889k
  letter_quote:
1081
    /* String */
1082
889k
    if (c == '\'' || c == '"') {
1083
57.8k
        int quote = c;
1084
57.8k
        int quote_size = 1;             /* 1 or 3 */
1085
57.8k
        int end_quote_size = 0;
1086
57.8k
        int has_escaped_quote = 0;
1087
1088
        /* Nodes of type STRING, especially multi-line strings,
1089
           must be handled differently in order to get both
1090
           the starting line number and the column offset right.
1091
           (cf. issue 16806) */
1092
57.8k
        tok->first_lineno = tok->lineno;
1093
57.8k
        tok->multi_line_start = tok->line_start;
1094
1095
        /* Find the quote size and start of string */
1096
57.8k
        c = tok_nextc(tok);
1097
57.8k
        if (c == quote) {
1098
9.96k
            c = tok_nextc(tok);
1099
9.96k
            if (c == quote) {
1100
2.47k
                quote_size = 3;
1101
2.47k
            }
1102
7.49k
            else {
1103
7.49k
                end_quote_size = 1;     /* empty string found */
1104
7.49k
            }
1105
9.96k
        }
1106
57.8k
        if (c != quote) {
1107
55.4k
            tok_backup(tok, c);
1108
55.4k
        }
1109
1110
        /* Get rest of string */
1111
1.14M
        while (end_quote_size != quote_size) {
1112
1.08M
            c = tok_nextc(tok);
1113
1.08M
            if (tok->done == E_ERROR) {
1114
0
                return MAKE_TOKEN(ERRORTOKEN);
1115
0
            }
1116
1.08M
            if (tok->done == E_DECODE) {
1117
0
                break;
1118
0
            }
1119
1.08M
            if (c == EOF || (quote_size == 1 && c == '\n')) {
1120
427
                assert(tok->multi_line_start != NULL);
1121
                // shift the tok_state's location into
1122
                // the start of string, and report the error
1123
                // from the initial quote character
1124
427
                tok->cur = (char *)tok->start;
1125
427
                tok->cur++;
1126
427
                tok->line_start = tok->multi_line_start;
1127
427
                int start = tok->lineno;
1128
427
                tok->lineno = tok->first_lineno;
1129
1130
427
                if (INSIDE_FSTRING(tok)) {
1131
                    /* When we are in an f-string, before raising the
1132
                     * unterminated string literal error, check whether
1133
                     * the initial quote matches the f-string's quotes;
1134
                     * if it does, this must be a missing '}' token,
1135
                     * so raise the proper error */
1136
22
                    tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
1137
22
                    if (the_current_tok->quote == quote &&
1138
22
                        the_current_tok->quote_size == quote_size) {
1139
12
                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1140
12
                            "%c-string: expecting '}'", TOK_GET_STRING_PREFIX(tok)));
1141
12
                    }
1142
22
                }
1143
1144
415
                if (quote_size == 3) {
1145
17
                    _PyTokenizer_syntaxerror(tok, "unterminated triple-quoted string literal"
1146
17
                                     " (detected at line %d)", start);
1147
17
                    if (c != '\n') {
1148
17
                        tok->done = E_EOFS;
1149
17
                    }
1150
17
                    return MAKE_TOKEN(ERRORTOKEN);
1151
17
                }
1152
398
                else {
1153
398
                    if (has_escaped_quote) {
1154
11
                        _PyTokenizer_syntaxerror(
1155
11
                            tok,
1156
11
                            "unterminated string literal (detected at line %d); "
1157
11
                            "perhaps you escaped the end quote?",
1158
11
                            start
1159
11
                        );
1160
387
                    } else {
1161
387
                        _PyTokenizer_syntaxerror(
1162
387
                            tok, "unterminated string literal (detected at line %d)", start
1163
387
                        );
1164
387
                    }
1165
398
                    if (c != '\n') {
1166
16
                        tok->done = E_EOLS;
1167
16
                    }
1168
398
                    return MAKE_TOKEN(ERRORTOKEN);
1169
398
                }
1170
415
            }
1171
1.08M
            if (c == quote) {
1172
56.3k
                end_quote_size += 1;
1173
56.3k
            }
1174
1.02M
            else {
1175
1.02M
                end_quote_size = 0;
1176
1.02M
                if (c == '\\') {
1177
32.1k
                    c = tok_nextc(tok);  /* skip escaped char */
1178
32.1k
                    if (c == quote) {  /* but record whether the escaped char was a quote */
1179
1.47k
                        has_escaped_quote = 1;
1180
1.47k
                    }
1181
32.1k
                    if (c == '\r') {
1182
205
                        c = tok_nextc(tok);
1183
205
                    }
1184
32.1k
                }
1185
1.02M
            }
1186
1.08M
        }
1187
1188
57.4k
        p_start = tok->start;
1189
57.4k
        p_end = tok->cur;
1190
57.4k
        return MAKE_TOKEN(STRING);
1191
57.8k
    }
1192
1193
    /* Line continuation */
1194
832k
    if (c == '\\') {
1195
425
        if ((c = tok_continuation_line(tok)) == -1) {
1196
70
            return MAKE_TOKEN(ERRORTOKEN);
1197
70
        }
1198
355
        tok->cont_line = 1;
1199
355
        goto again; /* Read next line */
1200
425
    }
1201
1202
    /* Punctuation character */
1203
831k
    int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{');
1204
831k
    if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) {
1205
        /* This code block gets executed before the curly_bracket_depth is incremented
1206
         * by the `{` case, so for ensuring that we are on the 0th level, we need
1207
         * to adjust it manually */
1208
52.8k
        int cursor = current_tok->curly_bracket_depth - (c != '{');
1209
52.8k
        int in_format_spec = current_tok->in_format_spec;
1210
52.8k
        int cursor_in_format_with_debug =
1211
52.8k
            cursor == 1 && (current_tok->in_debug || in_format_spec);
1212
52.8k
        int cursor_valid = cursor == 0 || cursor_in_format_with_debug;
1213
52.8k
        if ((cursor_valid) && !_PyLexer_update_ftstring_expr(tok, c)) {
1214
0
            return MAKE_TOKEN(ENDMARKER);
1215
0
        }
1216
52.8k
        if ((cursor_valid) && c != '{' && set_ftstring_expr(tok, token, c)) {
1217
6
            return MAKE_TOKEN(ERRORTOKEN);
1218
6
        }
1219
1220
52.7k
        if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) {
1221
3.96k
            current_tok->kind = TOK_FSTRING_MODE;
1222
3.96k
            current_tok->in_format_spec = 1;
1223
3.96k
            p_start = tok->start;
1224
3.96k
            p_end = tok->cur;
1225
3.96k
            return MAKE_TOKEN(_PyToken_OneChar(c));
1226
3.96k
        }
1227
52.7k
    }
1228
1229
    /* Check for two-character token */
1230
827k
    {
1231
827k
        int c2 = tok_nextc(tok);
1232
827k
        int current_token = _PyToken_TwoChars(c, c2);
1233
827k
        if (current_token != OP) {
1234
22.7k
            int c3 = tok_nextc(tok);
1235
22.7k
            int current_token3 = _PyToken_ThreeChars(c, c2, c3);
1236
22.7k
            if (current_token3 != OP) {
1237
1.20k
                current_token = current_token3;
1238
1.20k
            }
1239
21.5k
            else {
1240
21.5k
                tok_backup(tok, c3);
1241
21.5k
            }
1242
22.7k
            p_start = tok->start;
1243
22.7k
            p_end = tok->cur;
1244
22.7k
            return MAKE_TOKEN(current_token);
1245
22.7k
        }
1246
804k
        tok_backup(tok, c2);
1247
804k
    }
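
This is longest-match probing: _PyToken_TwoChars() and _PyToken_ThreeChars() return OP for unknown combinations, so a non-OP result means a longer operator was recognized. For example:

/*   "**=" -> TwoChars('*','*') is non-OP, and ThreeChars('*','*','=')
 *            upgrades it to the augmented-assignment token
 *   "*=2" -> TwoChars('*','=') is non-OP, ThreeChars('*','=','2') is OP,
 *            so '2' is pushed back with tok_backup()  */
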
1248
1249
    /* Keep track of parentheses nesting level */
1250
0
    switch (c) {
1251
90.4k
    case '(':
1252
124k
    case '[':
1253
169k
    case '{':
1254
169k
        if (tok->level >= MAXLEVEL) {
1255
4
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested parentheses"));
1256
4
        }
1257
169k
        tok->parenstack[tok->level] = c;
1258
169k
        tok->parenlinenostack[tok->level] = tok->lineno;
1259
169k
        tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
1260
169k
        tok->level++;
1261
169k
        if (INSIDE_FSTRING(tok)) {
1262
29.7k
            current_tok->curly_bracket_depth++;
1263
29.7k
        }
1264
169k
        break;
1265
59.2k
    case ')':
1266
70.9k
    case ']':
1267
96.8k
    case '}':
1268
96.8k
        if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') {
1269
48
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1270
48
                "%c-string: single '}' is not allowed", TOK_GET_STRING_PREFIX(tok)));
1271
48
        }
1272
96.8k
        if (!tok->tok_extra_tokens && !tok->level) {
1273
205
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "unmatched '%c'", c));
1274
205
        }
1275
96.6k
        if (tok->level > 0) {
1276
96.6k
            tok->level--;
1277
96.6k
            int opening = tok->parenstack[tok->level];
1278
96.6k
            if (!tok->tok_extra_tokens && !((opening == '(' && c == ')') ||
1279
96.6k
                                            (opening == '[' && c == ']') ||
1280
96.6k
                                            (opening == '{' && c == '}'))) {
1281
                /* If the opening bracket belongs to an f-string's expression
1282
                part (e.g. f"{)}") and the closing bracket is an arbitrary
1283
                nested expression, then instead of matching a different
1284
                syntactic construct with it, we'll throw an unmatched
1285
                parentheses error. */
1286
42
                if (INSIDE_FSTRING(tok) && opening == '{') {
1287
9
                    assert(current_tok->curly_bracket_depth >= 0);
1288
9
                    int previous_bracket = current_tok->curly_bracket_depth - 1;
1289
9
                    if (previous_bracket == current_tok->curly_bracket_expr_start_depth) {
1290
6
                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1291
6
                            "%c-string: unmatched '%c'", TOK_GET_STRING_PREFIX(tok), c));
1292
6
                    }
1293
9
                }
1294
36
                if (tok->parenlinenostack[tok->level] != tok->lineno) {
1295
4
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1296
4
                            "closing parenthesis '%c' does not match "
1297
4
                            "opening parenthesis '%c' on line %d",
1298
4
                            c, opening, tok->parenlinenostack[tok->level]));
1299
4
                }
1300
32
                else {
1301
32
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1302
32
                            "closing parenthesis '%c' does not match "
1303
32
                            "opening parenthesis '%c'",
1304
32
                            c, opening));
1305
32
                }
1306
36
            }
1307
96.6k
        }
1308
1309
96.5k
        if (INSIDE_FSTRING(tok)) {
1310
22.0k
            current_tok->curly_bracket_depth--;
1311
22.0k
            if (current_tok->curly_bracket_depth < 0) {
1312
1
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "%c-string: unmatched '%c'",
1313
1
                    TOK_GET_STRING_PREFIX(tok), c));
1314
1
            }
1315
22.0k
            if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
1316
20.0k
                current_tok->curly_bracket_expr_start_depth--;
1317
20.0k
                current_tok->kind = TOK_FSTRING_MODE;
1318
20.0k
                current_tok->in_format_spec = 0;
1319
20.0k
                current_tok->in_debug = 0;
1320
20.0k
            }
1321
22.0k
        }
1322
96.5k
        break;
1323
538k
    default:
1324
538k
        break;
1325
804k
    }
1326
1327
804k
    if (!Py_UNICODE_ISPRINTABLE(c)) {
1328
443
        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", c));
1329
443
    }
1330
1331
804k
    if (c == '=' && INSIDE_FSTRING_EXPR(current_tok)) {
1332
41.4k
        current_tok->in_debug = 1;
1333
41.4k
    }
1334
1335
    /* Punctuation character */
1336
804k
    p_start = tok->start;
1337
804k
    p_end = tok->cur;
1338
804k
    return MAKE_TOKEN(_PyToken_OneChar(c));
1339
804k
}
1340
1341
static int
1342
tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
1343
53.3k
{
1344
53.3k
    const char *p_start = NULL;
1345
53.3k
    const char *p_end = NULL;
1346
53.3k
    int end_quote_size = 0;
1347
53.3k
    int unicode_escape = 0;
1348
1349
53.3k
    tok->start = tok->cur;
1350
53.3k
    tok->first_lineno = tok->lineno;
1351
53.3k
    tok->starting_col_offset = tok->col_offset;
1352
1353
    // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize
1354
    // before it.
1355
53.3k
    int start_char = tok_nextc(tok);
1356
53.3k
    if (start_char == '{') {
1357
13.6k
        int peek1 = tok_nextc(tok);
1358
13.6k
        tok_backup(tok, peek1);
1359
13.6k
        tok_backup(tok, start_char);
1360
13.6k
        if (peek1 != '{') {
1361
10.0k
            current_tok->curly_bracket_expr_start_depth++;
1362
10.0k
            if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
1363
4
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1364
4
                    "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok)));
1365
4
            }
1366
10.0k
            TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
1367
10.0k
            return tok_get_normal_mode(tok, current_tok, token);
1368
10.0k
        }
1369
13.6k
    }
1370
39.6k
    else {
1371
39.6k
        tok_backup(tok, start_char);
1372
39.6k
    }
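    /* Annotation (not part of the upstream source): rough token stream for
       f"{x}" under this scheme, as reported by the `tokenize` module on
       CPython 3.12+:

           FSTRING_START 'f"'   OP '{'   NAME 'x'   OP '}'   FSTRING_END '"'

       A lone '{' makes this function switch the current mode to
       TOK_REGULAR_MODE and delegate, while a doubled '{{' stays here and is
       emitted as FSTRING_MIDDLE text. */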
1373
1374
    // Check if we are at the end of the string
1375
60.7k
    for (int i = 0; i < current_tok->quote_size; i++) {
1376
48.6k
        int quote = tok_nextc(tok);
1377
48.6k
        if (quote != current_tok->quote) {
1378
31.1k
            tok_backup(tok, quote);
1379
31.1k
            goto f_string_middle;
1380
31.1k
        }
1381
48.6k
    }
1382
1383
12.0k
    if (current_tok->last_expr_buffer != NULL) {
1384
6.10k
        PyMem_Free(current_tok->last_expr_buffer);
1385
6.10k
        current_tok->last_expr_buffer = NULL;
1386
6.10k
        current_tok->last_expr_size = 0;
1387
6.10k
        current_tok->last_expr_end = -1;
1388
6.10k
    }
1389
1390
12.0k
    p_start = tok->start;
1391
12.0k
    p_end = tok->cur;
1392
12.0k
    tok->tok_mode_stack_index--;
1393
12.0k
    return MAKE_TOKEN(FTSTRING_END(current_tok));
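    /* Annotation (not part of the upstream source): this early-exit path is
       what makes an empty literal such as f"" tokenize as just
       FSTRING_START 'f"' followed by FSTRING_END '"', with no
       FSTRING_MIDDLE token in between. */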
1394
1395
31.1k
f_string_middle:
1396
1397
    // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
1398
    // this.
1399
31.1k
    tok->multi_line_start = tok->line_start;
1400
163k
    while (end_quote_size != current_tok->quote_size) {
1401
157k
        int c = tok_nextc(tok);
1402
157k
        if (tok->done == E_ERROR || tok->done == E_DECODE) {
1403
0
            return MAKE_TOKEN(ERRORTOKEN);
1404
0
        }
1405
157k
        int in_format_spec = (
1406
157k
                current_tok->in_format_spec
1407
157k
                &&
1408
157k
                INSIDE_FSTRING_EXPR(current_tok)
1409
157k
        );
1410
1411
157k
        if (c == EOF || (current_tok->quote_size == 1 && c == '\n')) {
1412
466
            if (tok->decoding_erred) {
1413
0
                return MAKE_TOKEN(ERRORTOKEN);
1414
0
            }
1415
1416
            // If we are in a format spec and we find a newline,
1417
            // it means that the format spec ends here and we should
1418
            // return to regular mode.
1419
466
            if (in_format_spec && c == '\n') {
1420
75
                if (current_tok->quote_size == 1) {
1421
75
                    return MAKE_TOKEN(
1422
75
                        _PyTokenizer_syntaxerror(
1423
75
                            tok,
1424
75
                            "%c-string: newlines are not allowed in format specifiers for single quoted %c-strings",
1425
75
                            TOK_GET_STRING_PREFIX(tok), TOK_GET_STRING_PREFIX(tok)
1426
75
                        )
1427
75
                    );
1428
75
                }
1429
0
                tok_backup(tok, c);
1430
0
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
1431
0
                current_tok->in_format_spec = 0;
1432
0
                p_start = tok->start;
1433
0
                p_end = tok->cur;
1434
0
                return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
1435
75
            }
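            /* Annotation (not part of the upstream source): an input that
               reaches the error above, using the message from this listing:

                   f"{x:     ->  SyntaxError: f-string: newlines are not
                   }"            allowed in format specifiers for single
                                 quoted f-strings

               In a triple-quoted f-string the newline never enters this
               branch (the outer EOF/newline test requires quote_size == 1)
               and is treated as ordinary FSTRING_MIDDLE text. */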
1436
1437
391
            assert(tok->multi_line_start != NULL);
1438
            // shift the tok_state's location back
1439
            // to the start of the string, and report
1440
            // the error from the initial quote character
1441
391
            tok->cur = (char *)current_tok->start;
1442
391
            tok->cur++;
1443
391
            tok->line_start = current_tok->multi_line_start;
1444
391
            int start = tok->lineno;
1445
1446
391
            tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
1447
391
            tok->lineno = the_current_tok->first_line;
1448
1449
391
            if (current_tok->quote_size == 3) {
1450
31
                _PyTokenizer_syntaxerror(tok,
1451
31
                                    "unterminated triple-quoted %c-string literal"
1452
31
                                    " (detected at line %d)",
1453
31
                                    TOK_GET_STRING_PREFIX(tok), start);
1454
31
                if (c != '\n') {
1455
31
                    tok->done = E_EOFS;
1456
31
                }
1457
31
                return MAKE_TOKEN(ERRORTOKEN);
1458
31
            }
1459
360
            else {
1460
360
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1461
360
                                    "unterminated %c-string literal (detected at"
1462
360
                                    " line %d)", TOK_GET_STRING_PREFIX(tok), start));
1463
360
            }
1464
391
        }
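        /* Annotation (not part of the upstream source): the location
           shuffling above makes the error point at the opening quote, e.g.
           with the messages from this listing:

               f"abc     ->  SyntaxError: unterminated f-string literal
                             (detected at line 1)

               f"""abc   ->  SyntaxError: unterminated triple-quoted
                             f-string literal (detected at line 1)  */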
1465
1466
157k
        if (c == current_tok->quote) {
1467
8.81k
            end_quote_size += 1;
1468
8.81k
            continue;
1469
148k
        } else {
1470
148k
            end_quote_size = 0;
1471
148k
        }
1472
1473
148k
        if (c == '{') {
1474
19.7k
            if (!_PyLexer_update_ftstring_expr(tok, c)) {
1475
0
                return MAKE_TOKEN(ENDMARKER);
1476
0
            }
1477
19.7k
            int peek = tok_nextc(tok);
1478
19.7k
            if (peek != '{' || in_format_spec) {
1479
15.6k
                tok_backup(tok, peek);
1480
15.6k
                tok_backup(tok, c);
1481
15.6k
                current_tok->curly_bracket_expr_start_depth++;
1482
15.6k
                if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
1483
5
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1484
5
                        "%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok)));
1485
5
                }
1486
15.6k
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
1487
15.6k
                current_tok->in_format_spec = 0;
1488
15.6k
                p_start = tok->start;
1489
15.6k
                p_end = tok->cur;
1490
15.6k
            } else {
1491
4.07k
                p_start = tok->start;
1492
4.07k
                p_end = tok->cur - 1;
1493
4.07k
            }
1494
19.7k
            return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
1495
128k
        } else if (c == '}') {
1496
4.87k
            if (unicode_escape) {
1497
443
                p_start = tok->start;
1498
443
                p_end = tok->cur;
1499
443
                return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
1500
443
            }
1501
4.43k
            int peek = tok_nextc(tok);
1502
1503
            // The tokenizer can only be in the format spec if we have already completed the expression
1504
            // scanning (indicated by the end of the expression being set) and we are not at the top level
1505
            // of the bracket stack (-1 is the top level). Since format specifiers can't legally contain
1506
            // doubled brackets, we can bypass the '}}' escape handling here.
1507
4.43k
            int cursor = current_tok->curly_bracket_depth;
1508
4.43k
            if (peek == '}' && !in_format_spec && cursor == 0) {
1509
1.82k
                p_start = tok->start;
1510
1.82k
                p_end = tok->cur - 1;
1511
2.61k
            } else {
1512
2.61k
                tok_backup(tok, peek);
1513
2.61k
                tok_backup(tok, c);
1514
2.61k
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
1515
2.61k
                current_tok->in_format_spec = 0;
1516
2.61k
                p_start = tok->start;
1517
2.61k
                p_end = tok->cur;
1518
2.61k
            }
1519
4.43k
            return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
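            /* Annotation (not part of the upstream source): the cursor == 0
               test above is what distinguishes the doubled-brace escape from
               a real field closer, e.g.:

                   f"}}"       ->  '}'      ('}}' collapses to a literal '}')
                   f"{1:>3}"   ->  '  1'    (the final '}' ends the
                                             replacement field)  */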
1520
123k
        } else if (c == '\\') {
1521
7.22k
            int peek = tok_nextc(tok);
1522
7.22k
            if (peek == '\r') {
1523
69
                peek = tok_nextc(tok);
1524
69
            }
1525
            // Special case when the backslash is right before a curly
1526
            // brace: we have to restore the character and return control
1527
            // to the loop for the next iteration.
1528
7.22k
            if (peek == '{' || peek == '}') {
1529
1.51k
                if (!current_tok->raw) {
1530
1.32k
                    if (_PyTokenizer_warn_invalid_escape_sequence(tok, peek)) {
1531
0
                        return MAKE_TOKEN(ERRORTOKEN);
1532
0
                    }
1533
1.32k
                }
1534
1.51k
                tok_backup(tok, peek);
1535
1.51k
                continue;
1536
1.51k
            }
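            /* Annotation (not part of the upstream source): on CPython 3.12+
               this path warns rather than errors, e.g.:

                   f"\{1}"   ->  SyntaxWarning: invalid escape sequence '\{'

               while a raw rf"\{1}" is accepted silently; in both cases the
               brace is pushed back and handled on the next loop
               iteration. */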
1537
1538
5.70k
            if (!current_tok->raw) {
1539
5.34k
                if (peek == 'N') {
1540
                    /* Handle named unicode escapes (\N{BULLET}) */
1541
689
                    peek = tok_nextc(tok);
1542
689
                    if (peek == '{') {
1543
470
                        unicode_escape = 1;
1544
470
                    } else {
1545
219
                        tok_backup(tok, peek);
1546
219
                    }
1547
689
                }
1548
5.34k
            } /* else {
1549
                skip the escaped character
1550
            }*/
1551
5.70k
        }
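        /* Annotation (not part of the upstream source): the unicode_escape
           flag set above keeps the '}' that terminates a named escape from
           being mistaken for the end of a replacement field, so e.g.
           f"\N{BULLET}" yields '\u2022' instead of a syntax error. */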
1552
148k
    }
1553
1554
    // Back up over the f-string quotes to emit a final FSTRING_MIDDLE and
1555
    // add the quotes to the FSTRING_END in the next tokenizer iteration.
1556
13.0k
    for (int i = 0; i < current_tok->quote_size; i++) {
1557
6.94k
        tok_backup(tok, current_tok->quote);
1558
6.94k
    }
1559
6.07k
    p_start = tok->start;
1560
6.07k
    p_end = tok->cur;
1561
6.07k
    return MAKE_TOKEN(FTSTRING_MIDDLE(current_tok));
1562
31.1k
}
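/* Annotation (not part of the upstream source): end-to-end, the mode
   switching above yields token streams such as (via the `tokenize` module
   on CPython 3.12+):

       f"a{b}c"  ->  FSTRING_START 'f"', FSTRING_MIDDLE 'a', OP '{',
                     NAME 'b', OP '}', FSTRING_MIDDLE 'c', FSTRING_END '"'

   The quote backup above is what lets the trailing 'c' come out as its own
   FSTRING_MIDDLE before FSTRING_END is produced on the next call. */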
1563
1564
static int
1565
tok_get(struct tok_state *tok, struct token *token)
1566
1.74M
{
1567
1.74M
    tokenizer_mode *current_tok = TOK_GET_MODE(tok);
1568
1.74M
    if (current_tok->kind == TOK_REGULAR_MODE) {
1569
1.69M
        return tok_get_normal_mode(tok, current_tok, token);
1570
1.69M
    } else {
1571
53.3k
        return tok_get_fstring_mode(tok, current_tok, token);
1572
53.3k
    }
1573
1.74M
}
1574
1575
int
1576
_PyTokenizer_Get(struct tok_state *tok, struct token *token)
1577
1.74M
{
1578
1.74M
    int result = tok_get(tok, token);
1579
1.74M
    if (tok->decoding_erred) {
1580
0
        result = ERRORTOKEN;
1581
0
        tok->done = E_DECODE;
1582
0
    }
1583
1.74M
    return result;
1584
1.74M
}