Coverage Report

Created: 2025-08-26 06:26

/src/cpython/Parser/pegen.c
Line
Count
Source (jump to first uncovered line)
1
#include <Python.h>
2
#include "pycore_ast.h"           // _PyAST_Validate(),
3
#include "pycore_pystate.h"       // _PyThreadState_GET()
4
#include "pycore_parser.h"        // _PYPEGEN_NSTATISTICS
5
#include "pycore_pyerrors.h"      // PyExc_IncompleteInputError
6
#include "pycore_runtime.h"     // _PyRuntime
7
#include "pycore_unicodeobject.h" // _PyUnicode_InternImmortal
8
#include "pycore_pyatomic_ft_wrappers.h"
9
#include <errcode.h>
10
11
#include "lexer/lexer.h"
12
#include "tokenizer/tokenizer.h"
13
#include "pegen.h"
14
15
// Internal parser functions
16
17
asdl_stmt_seq*
18
_PyPegen_interactive_exit(Parser *p)
19
0
{
20
0
    if (p->errcode) {
21
0
        *(p->errcode) = E_EOF;
22
0
    }
23
0
    return NULL;
24
0
}
25
26
Py_ssize_t
27
_PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset)
28
0
{
29
0
    const unsigned char *data = (const unsigned char*)PyUnicode_AsUTF8(line);
30
31
0
    Py_ssize_t len = 0;
32
0
    while (col_offset < end_col_offset) {
33
0
        Py_UCS4 ch = data[col_offset];
34
0
        if (ch < 0x80) {
35
0
            col_offset += 1;
36
0
        } else if ((ch & 0xe0) == 0xc0) {
37
0
            col_offset += 2;
38
0
        } else if ((ch & 0xf0) == 0xe0) {
39
0
            col_offset += 3;
40
0
        } else if ((ch & 0xf8) == 0xf0) {
41
0
            col_offset += 4;
42
0
        } else {
43
0
            PyErr_SetString(PyExc_ValueError, "Invalid UTF-8 sequence");
44
0
            return -1;
45
0
        }
46
0
        len++;
47
0
    }
48
0
    return len;
49
0
}
50
51
Py_ssize_t
52
_PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
53
20.2k
{
54
20.2k
    Py_ssize_t len = (Py_ssize_t)strlen(str);
55
20.2k
    if (col_offset > len + 1) {
56
12
        col_offset = len + 1;
57
12
    }
58
20.2k
    assert(col_offset >= 0);
59
20.2k
    PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace");
60
20.2k
    if (!text) {
61
0
        return -1;
62
0
    }
63
20.2k
    Py_ssize_t size = PyUnicode_GET_LENGTH(text);
64
20.2k
    Py_DECREF(text);
65
20.2k
    return size;
66
20.2k
}
67
68
Py_ssize_t
69
_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
70
20.2k
{
71
20.2k
    const char *str = PyUnicode_AsUTF8(line);
72
20.2k
    if (!str) {
73
0
        return -1;
74
0
    }
75
20.2k
    return _PyPegen_byte_offset_to_character_offset_raw(str, col_offset);
76
20.2k
}
77
78
// Here, mark is the start of the node, while p->mark is the end.
79
// If node==NULL, they should be the same.
80
int
81
_PyPegen_insert_memo(Parser *p, int mark, int type, void *node)
82
11.4M
{
83
    // Insert in front
84
11.4M
    Memo *m = _PyArena_Malloc(p->arena, sizeof(Memo));
85
11.4M
    if (m == NULL) {
86
0
        return -1;
87
0
    }
88
11.4M
    m->type = type;
89
11.4M
    m->node = node;
90
11.4M
    m->mark = p->mark;
91
11.4M
    m->next = p->tokens[mark]->memo;
92
11.4M
    p->tokens[mark]->memo = m;
93
11.4M
    return 0;
94
11.4M
}
95
96
// Like _PyPegen_insert_memo(), but updates an existing node if found.
97
int
98
_PyPegen_update_memo(Parser *p, int mark, int type, void *node)
99
8.86M
{
100
43.6M
    for (Memo *m = p->tokens[mark]->memo; m != NULL; m = m->next) {
101
38.0M
        if (m->type == type) {
102
            // Update existing node.
103
3.24M
            m->node = node;
104
3.24M
            m->mark = p->mark;
105
3.24M
            return 0;
106
3.24M
        }
107
38.0M
    }
108
    // Insert new node.
109
5.61M
    return _PyPegen_insert_memo(p, mark, type, node);
110
8.86M
}
111
112
static int
113
init_normalization(Parser *p)
114
56.5k
{
115
56.5k
    if (p->normalize) {
116
54.8k
        return 1;
117
54.8k
    }
118
1.65k
    p->normalize = PyImport_ImportModuleAttrString("unicodedata", "normalize");
119
1.65k
    if (!p->normalize)
120
0
    {
121
0
        return 0;
122
0
    }
123
1.65k
    return 1;
124
1.65k
}
125
126
static int
127
21.3k
growable_comment_array_init(growable_comment_array *arr, size_t initial_size) {
128
21.3k
    assert(initial_size > 0);
129
21.3k
    arr->items = PyMem_Malloc(initial_size * sizeof(*arr->items));
130
21.3k
    arr->size = initial_size;
131
21.3k
    arr->num_items = 0;
132
133
21.3k
    return arr->items != NULL;
134
21.3k
}
135
136
static int
137
0
growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) {
138
0
    if (arr->num_items >= arr->size) {
139
0
        size_t new_size = arr->size * 2;
140
0
        void *new_items_array = PyMem_Realloc(arr->items, new_size * sizeof(*arr->items));
141
0
        if (!new_items_array) {
142
0
            return 0;
143
0
        }
144
0
        arr->items = new_items_array;
145
0
        arr->size = new_size;
146
0
    }
147
148
0
    arr->items[arr->num_items].lineno = lineno;
149
0
    arr->items[arr->num_items].comment = comment;  // Take ownership
150
0
    arr->num_items++;
151
0
    return 1;
152
0
}
153
154
static void
155
21.3k
growable_comment_array_deallocate(growable_comment_array *arr) {
156
21.3k
    for (unsigned i = 0; i < arr->num_items; i++) {
157
0
        PyMem_Free(arr->items[i].comment);
158
0
    }
159
21.3k
    PyMem_Free(arr->items);
160
21.3k
}
161
162
static int
163
_get_keyword_or_name_type(Parser *p, struct token *new_token)
164
496k
{
165
496k
    Py_ssize_t name_len = new_token->end_col_offset - new_token->col_offset;
166
496k
    assert(name_len > 0);
167
168
496k
    if (name_len >= p->n_keyword_lists ||
169
496k
        p->keywords[name_len] == NULL ||
170
496k
        p->keywords[name_len]->type == -1) {
171
218k
        return NAME;
172
218k
    }
173
1.43M
    for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
174
1.27M
        if (strncmp(k->str, new_token->start, (size_t)name_len) == 0) {
175
121k
            return k->type;
176
121k
        }
177
1.27M
    }
178
156k
    return NAME;
179
277k
}
180
181
static int
182
1.75M
initialize_token(Parser *p, Token *parser_token, struct token *new_token, int token_type) {
183
1.75M
    assert(parser_token != NULL);
184
185
1.75M
    parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type;
186
1.75M
    parser_token->bytes = PyBytes_FromStringAndSize(new_token->start, new_token->end - new_token->start);
187
1.75M
    if (parser_token->bytes == NULL) {
188
0
        return -1;
189
0
    }
190
1.75M
    if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) {
191
0
        Py_DECREF(parser_token->bytes);
192
0
        return -1;
193
0
    }
194
195
1.75M
    parser_token->metadata = NULL;
196
1.75M
    if (new_token->metadata != NULL) {
197
9.75k
        if (_PyArena_AddPyObject(p->arena, new_token->metadata) < 0) {
198
0
            Py_DECREF(new_token->metadata);
199
0
            return -1;
200
0
        }
201
9.75k
        parser_token->metadata = new_token->metadata;
202
9.75k
        new_token->metadata = NULL;
203
9.75k
    }
204
205
1.75M
    parser_token->level = new_token->level;
206
1.75M
    parser_token->lineno = new_token->lineno;
207
1.75M
    parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset
208
1.75M
                                                                    : new_token->col_offset;
209
1.75M
    parser_token->end_lineno = new_token->end_lineno;
210
1.75M
    parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->end_col_offset
211
1.75M
                                                                 : new_token->end_col_offset;
212
213
1.75M
    p->fill += 1;
214
215
1.75M
    if (token_type == ERRORTOKEN && p->tok->done == E_DECODE) {
216
858
        return _Pypegen_raise_decode_error(p);
217
858
    }
218
219
1.75M
    return (token_type == ERRORTOKEN ? _Pypegen_tokenizer_error(p) : 0);
220
1.75M
}
221
222
static int
223
80.3k
_resize_tokens_array(Parser *p) {
224
80.3k
    int newsize = p->size * 2;
225
80.3k
    Token **new_tokens = PyMem_Realloc(p->tokens, (size_t)newsize * sizeof(Token *));
226
80.3k
    if (new_tokens == NULL) {
227
0
        PyErr_NoMemory();
228
0
        return -1;
229
0
    }
230
80.3k
    p->tokens = new_tokens;
231
232
2.64M
    for (int i = p->size; i < newsize; i++) {
233
2.56M
        p->tokens[i] = PyMem_Calloc(1, sizeof(Token));
234
2.56M
        if (p->tokens[i] == NULL) {
235
0
            p->size = i; // Needed, in order to cleanup correctly after parser fails
236
0
            PyErr_NoMemory();
237
0
            return -1;
238
0
        }
239
2.56M
    }
240
80.3k
    p->size = newsize;
241
80.3k
    return 0;
242
80.3k
}
243
244
int
245
_PyPegen_fill_token(Parser *p)
246
1.75M
{
247
1.75M
    struct token new_token;
248
1.75M
    _PyToken_Init(&new_token);
249
1.75M
    int type = _PyTokenizer_Get(p->tok, &new_token);
250
251
    // Record and skip '# type: ignore' comments
252
1.75M
    while (type == TYPE_IGNORE) {
253
0
        Py_ssize_t len = new_token.end_col_offset - new_token.col_offset;
254
0
        char *tag = PyMem_Malloc((size_t)len + 1);
255
0
        if (tag == NULL) {
256
0
            PyErr_NoMemory();
257
0
            goto error;
258
0
        }
259
0
        strncpy(tag, new_token.start, (size_t)len);
260
0
        tag[len] = '\0';
261
        // Ownership of tag passes to the growable array
262
0
        if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
263
0
            PyErr_NoMemory();
264
0
            goto error;
265
0
        }
266
0
        type = _PyTokenizer_Get(p->tok, &new_token);
267
0
    }
268
269
    // If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
270
1.75M
    if (p->start_rule == Py_single_input && type == ENDMARKER && p->parsing_started) {
271
0
        type = NEWLINE; /* Add an extra newline */
272
0
        p->parsing_started = 0;
273
274
0
        if (p->tok->indent && !(p->flags & PyPARSE_DONT_IMPLY_DEDENT)) {
275
0
            p->tok->pendin = -p->tok->indent;
276
0
            p->tok->indent = 0;
277
0
        }
278
0
    }
279
1.75M
    else {
280
1.75M
        p->parsing_started = 1;
281
1.75M
    }
282
283
    // Check if we are at the limit of the token array capacity and resize if needed
284
1.75M
    if ((p->fill == p->size) && (_resize_tokens_array(p) != 0)) {
285
0
        goto error;
286
0
    }
287
288
1.75M
    Token *t = p->tokens[p->fill];
289
1.75M
    return initialize_token(p, t, &new_token, type);
290
0
error:
291
0
    _PyToken_Free(&new_token);
292
0
    return -1;
293
1.75M
}
294
295
#if defined(Py_DEBUG)
296
// Instrumentation to count the effectiveness of memoization.
297
// The array counts the number of tokens skipped by memoization,
298
// indexed by type.
299
300
#define NSTATISTICS _PYPEGEN_NSTATISTICS
301
#define memo_statistics _PyRuntime.parser.memo_statistics
302
303
void
304
_PyPegen_clear_memo_statistics(void)
305
{
306
    FT_MUTEX_LOCK(&_PyRuntime.parser.mutex);
307
    for (int i = 0; i < NSTATISTICS; i++) {
308
        memo_statistics[i] = 0;
309
    }
310
    FT_MUTEX_UNLOCK(&_PyRuntime.parser.mutex);
311
}
312
313
PyObject *
314
_PyPegen_get_memo_statistics(void)
315
{
316
    PyObject *ret = PyList_New(NSTATISTICS);
317
    if (ret == NULL) {
318
        return NULL;
319
    }
320
321
    FT_MUTEX_LOCK(&_PyRuntime.parser.mutex);
322
    for (int i = 0; i < NSTATISTICS; i++) {
323
        PyObject *value = PyLong_FromLong(memo_statistics[i]);
324
        if (value == NULL) {
325
            FT_MUTEX_UNLOCK(&_PyRuntime.parser.mutex);
326
            Py_DECREF(ret);
327
            return NULL;
328
        }
329
        // PyList_SetItem borrows a reference to value.
330
        if (PyList_SetItem(ret, i, value) < 0) {
331
            FT_MUTEX_UNLOCK(&_PyRuntime.parser.mutex);
332
            Py_DECREF(ret);
333
            return NULL;
334
        }
335
    }
336
    FT_MUTEX_UNLOCK(&_PyRuntime.parser.mutex);
337
    return ret;
338
}
339
#endif
340
341
int  // bool
342
_PyPegen_is_memoized(Parser *p, int type, void *pres)
343
43.5M
{
344
43.5M
    if (p->mark == p->fill) {
345
447k
        if (_PyPegen_fill_token(p) < 0) {
346
751
            p->error_indicator = 1;
347
751
            return -1;
348
751
        }
349
447k
    }
350
351
43.5M
    Token *t = p->tokens[p->mark];
352
353
127M
    for (Memo *m = t->memo; m != NULL; m = m->next) {
354
115M
        if (m->type == type) {
355
#if defined(Py_DEBUG)
356
            if (0 <= type && type < NSTATISTICS) {
357
                long count = m->mark - p->mark;
358
                // A memoized negative result counts for one.
359
                if (count <= 0) {
360
                    count = 1;
361
                }
362
                FT_MUTEX_LOCK(&_PyRuntime.parser.mutex);
363
                memo_statistics[type] += count;
364
                FT_MUTEX_UNLOCK(&_PyRuntime.parser.mutex);
365
            }
366
#endif
367
31.9M
            p->mark = m->mark;
368
31.9M
            *(void **)(pres) = m->node;
369
31.9M
            return 1;
370
31.9M
        }
371
115M
    }
372
11.6M
    return 0;
373
43.5M
}
374
375
#define LOOKAHEAD1(NAME, RES_TYPE)                                  \
376
    int                                                             \
377
    NAME (int positive, RES_TYPE (func)(Parser *), Parser *p)       \
378
2.26M
    {                                                               \
379
2.26M
        int mark = p->mark;                                         \
380
2.26M
        void *res = func(p);                                        \
381
2.26M
        p->mark = mark;                                             \
382
2.26M
        return (res != NULL) == positive;                           \
383
2.26M
    }
_PyPegen_lookahead
Line
Count
Source
378
2.26M
    {                                                               \
379
2.26M
        int mark = p->mark;                                         \
380
2.26M
        void *res = func(p);                                        \
381
2.26M
        p->mark = mark;                                             \
382
2.26M
        return (res != NULL) == positive;                           \
383
2.26M
    }
_PyPegen_lookahead_for_expr
Line
Count
Source
378
963
    {                                                               \
379
963
        int mark = p->mark;                                         \
380
963
        void *res = func(p);                                        \
381
963
        p->mark = mark;                                             \
382
963
        return (res != NULL) == positive;                           \
383
963
    }
Unexecuted instantiation: _PyPegen_lookahead_for_stmt
384
385
LOOKAHEAD1(_PyPegen_lookahead, void *)
386
LOOKAHEAD1(_PyPegen_lookahead_for_expr, expr_ty)
387
LOOKAHEAD1(_PyPegen_lookahead_for_stmt, stmt_ty)
388
#undef LOOKAHEAD1
389
390
#define LOOKAHEAD2(NAME, RES_TYPE, T)                                   \
391
    int                                                                 \
392
    NAME (int positive, RES_TYPE (func)(Parser *, T), Parser *p, T arg) \
393
3.45M
    {                                                                   \
394
3.45M
        int mark = p->mark;                                             \
395
3.45M
        void *res = func(p, arg);                                       \
396
3.45M
        p->mark = mark;                                                 \
397
3.45M
        return (res != NULL) == positive;                               \
398
3.45M
    }
_PyPegen_lookahead_with_int
Line
Count
Source
393
3.23M
    {                                                                   \
394
3.23M
        int mark = p->mark;                                             \
395
3.23M
        void *res = func(p, arg);                                       \
396
3.23M
        p->mark = mark;                                                 \
397
3.23M
        return (res != NULL) == positive;                               \
398
3.23M
    }
_PyPegen_lookahead_with_string
Line
Count
Source
393
214k
    {                                                                   \
394
214k
        int mark = p->mark;                                             \
395
214k
        void *res = func(p, arg);                                       \
396
214k
        p->mark = mark;                                                 \
397
214k
        return (res != NULL) == positive;                               \
398
214k
    }
399
400
LOOKAHEAD2(_PyPegen_lookahead_with_int, Token *, int)
401
LOOKAHEAD2(_PyPegen_lookahead_with_string, expr_ty, const char *)
402
#undef LOOKAHEAD2
403
404
Token *
405
_PyPegen_expect_token(Parser *p, int type)
406
53.1M
{
407
53.1M
    if (p->mark == p->fill) {
408
936k
        if (_PyPegen_fill_token(p) < 0) {
409
2.72k
            p->error_indicator = 1;
410
2.72k
            return NULL;
411
2.72k
        }
412
936k
    }
413
53.1M
    Token *t = p->tokens[p->mark];
414
53.1M
    if (t->type != type) {
415
46.6M
       return NULL;
416
46.6M
    }
417
6.46M
    p->mark += 1;
418
6.46M
    return t;
419
53.1M
}
420
421
void*
422
0
_PyPegen_expect_forced_result(Parser *p, void* result, const char* expected) {
423
424
0
    if (p->error_indicator == 1) {
425
0
        return NULL;
426
0
    }
427
0
    if (result == NULL) {
428
0
        RAISE_SYNTAX_ERROR("expected (%s)", expected);
429
0
        return NULL;
430
0
    }
431
0
    return result;
432
0
}
433
434
Token *
435
20.5k
_PyPegen_expect_forced_token(Parser *p, int type, const char* expected) {
436
437
20.5k
    if (p->error_indicator == 1) {
438
0
        return NULL;
439
0
    }
440
441
20.5k
    if (p->mark == p->fill) {
442
6.57k
        if (_PyPegen_fill_token(p) < 0) {
443
1
            p->error_indicator = 1;
444
1
            return NULL;
445
1
        }
446
6.57k
    }
447
20.5k
    Token *t = p->tokens[p->mark];
448
20.5k
    if (t->type != type) {
449
144
        RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "expected '%s'", expected);
450
144
        return NULL;
451
144
    }
452
20.4k
    p->mark += 1;
453
20.4k
    return t;
454
20.5k
}
455
456
expr_ty
457
_PyPegen_expect_soft_keyword(Parser *p, const char *keyword)
458
435k
{
459
435k
    if (p->mark == p->fill) {
460
6.19k
        if (_PyPegen_fill_token(p) < 0) {
461
7
            p->error_indicator = 1;
462
7
            return NULL;
463
7
        }
464
6.19k
    }
465
435k
    Token *t = p->tokens[p->mark];
466
435k
    if (t->type != NAME) {
467
236k
        return NULL;
468
236k
    }
469
199k
    const char *s = PyBytes_AsString(t->bytes);
470
199k
    if (!s) {
471
0
        p->error_indicator = 1;
472
0
        return NULL;
473
0
    }
474
199k
    if (strcmp(s, keyword) != 0) {
475
170k
        return NULL;
476
170k
    }
477
28.2k
    return _PyPegen_name_token(p);
478
199k
}
479
480
Token *
481
_PyPegen_get_last_nonnwhitespace_token(Parser *p)
482
1.68M
{
483
1.68M
    assert(p->mark >= 0);
484
1.68M
    Token *token = NULL;
485
1.75M
    for (int m = p->mark - 1; m >= 0; m--) {
486
1.75M
        token = p->tokens[m];
487
1.75M
        if (token->type != ENDMARKER && (token->type < NEWLINE || token->type > DEDENT)) {
488
1.68M
            break;
489
1.68M
        }
490
1.75M
    }
491
1.68M
    return token;
492
1.68M
}
493
494
PyObject *
495
_PyPegen_new_identifier(Parser *p, const char *n)
496
2.32M
{
497
2.32M
    PyObject *id = PyUnicode_DecodeUTF8(n, (Py_ssize_t)strlen(n), NULL);
498
2.32M
    if (!id) {
499
0
        goto error;
500
0
    }
501
    /* Check whether there are non-ASCII characters in the
502
       identifier; if so, normalize to NFKC. */
503
2.32M
    if (!PyUnicode_IS_ASCII(id))
504
56.5k
    {
505
56.5k
        if (!init_normalization(p))
506
0
        {
507
0
            Py_DECREF(id);
508
0
            goto error;
509
0
        }
510
56.5k
        PyObject *form = PyUnicode_InternFromString("NFKC");
511
56.5k
        if (form == NULL)
512
0
        {
513
0
            Py_DECREF(id);
514
0
            goto error;
515
0
        }
516
56.5k
        PyObject *args[2] = {form, id};
517
56.5k
        PyObject *id2 = PyObject_Vectorcall(p->normalize, args, 2, NULL);
518
56.5k
        Py_DECREF(id);
519
56.5k
        Py_DECREF(form);
520
56.5k
        if (!id2) {
521
0
            goto error;
522
0
        }
523
524
56.5k
        if (!PyUnicode_Check(id2))
525
0
        {
526
0
            PyErr_Format(PyExc_TypeError,
527
0
                         "unicodedata.normalize() must return a string, not "
528
0
                         "%.200s",
529
0
                         _PyType_Name(Py_TYPE(id2)));
530
0
            Py_DECREF(id2);
531
0
            goto error;
532
0
        }
533
56.5k
        id = id2;
534
56.5k
    }
535
2.32M
    static const char * const forbidden[] = {
536
2.32M
        "None",
537
2.32M
        "True",
538
2.32M
        "False",
539
2.32M
        NULL
540
2.32M
    };
541
9.28M
    for (int i = 0; forbidden[i] != NULL; i++) {
542
6.96M
        if (_PyUnicode_EqualToASCIIString(id, forbidden[i])) {
543
1
            PyErr_Format(PyExc_ValueError,
544
1
                         "identifier field can't represent '%s' constant",
545
1
                         forbidden[i]);
546
1
            Py_DECREF(id);
547
1
            goto error;
548
1
        }
549
6.96M
    }
550
2.32M
    PyInterpreterState *interp = _PyInterpreterState_GET();
551
2.32M
    _PyUnicode_InternImmortal(interp, &id);
552
2.32M
    if (_PyArena_AddPyObject(p->arena, id) < 0)
553
0
    {
554
0
        Py_DECREF(id);
555
0
        goto error;
556
0
    }
557
2.32M
    return id;
558
559
1
error:
560
1
    p->error_indicator = 1;
561
1
    return NULL;
562
2.32M
}
563
564
static expr_ty
565
_PyPegen_name_from_token(Parser *p, Token* t)
566
5.53M
{
567
5.53M
    if (t == NULL) {
568
3.21M
        return NULL;
569
3.21M
    }
570
2.32M
    const char *s = PyBytes_AsString(t->bytes);
571
2.32M
    if (!s) {
572
0
        p->error_indicator = 1;
573
0
        return NULL;
574
0
    }
575
2.32M
    PyObject *id = _PyPegen_new_identifier(p, s);
576
2.32M
    if (id == NULL) {
577
1
        p->error_indicator = 1;
578
1
        return NULL;
579
1
    }
580
2.32M
    return _PyAST_Name(id, Load, t->lineno, t->col_offset, t->end_lineno,
581
2.32M
                       t->end_col_offset, p->arena);
582
2.32M
}
583
584
expr_ty
585
_PyPegen_name_token(Parser *p)
586
5.52M
{
587
5.52M
    Token *t = _PyPegen_expect_token(p, NAME);
588
5.52M
    return _PyPegen_name_from_token(p, t);
589
5.52M
}
590
591
void *
592
_PyPegen_string_token(Parser *p)
593
1.36M
{
594
1.36M
    return _PyPegen_expect_token(p, STRING);
595
1.36M
}
596
597
225k
expr_ty _PyPegen_soft_keyword_token(Parser *p) {
598
225k
    Token *t = _PyPegen_expect_token(p, NAME);
599
225k
    if (t == NULL) {
600
156k
        return NULL;
601
156k
    }
602
69.7k
    char *the_token;
603
69.7k
    Py_ssize_t size;
604
69.7k
    PyBytes_AsStringAndSize(t->bytes, &the_token, &size);
605
333k
    for (char **keyword = p->soft_keywords; *keyword != NULL; keyword++) {
606
267k
        if (strlen(*keyword) == (size_t)size &&
607
267k
            strncmp(*keyword, the_token, (size_t)size) == 0) {
608
4.60k
            return _PyPegen_name_from_token(p, t);
609
4.60k
        }
610
267k
    }
611
65.1k
    return NULL;
612
69.7k
}
613
614
static PyObject *
615
parsenumber_raw(const char *s)
616
257k
{
617
257k
    const char *end;
618
257k
    long x;
619
257k
    double dx;
620
257k
    Py_complex compl;
621
257k
    int imflag;
622
623
257k
    assert(s != NULL);
624
257k
    errno = 0;
625
257k
    end = s + strlen(s) - 1;
626
257k
    imflag = *end == 'j' || *end == 'J';
627
257k
    if (s[0] == '0') {
628
77.6k
        x = (long)PyOS_strtoul(s, (char **)&end, 0);
629
77.6k
        if (x < 0 && errno == 0) {
630
335
            return PyLong_FromString(s, (char **)0, 0);
631
335
        }
632
77.6k
    }
633
179k
    else {
634
179k
        x = PyOS_strtol(s, (char **)&end, 0);
635
179k
    }
636
257k
    if (*end == '\0') {
637
195k
        if (errno != 0) {
638
2.76k
            return PyLong_FromString(s, (char **)0, 0);
639
2.76k
        }
640
192k
        return PyLong_FromLong(x);
641
195k
    }
642
    /* XXX Huge floats may silently fail */
643
61.4k
    if (imflag) {
644
10.4k
        compl.real = 0.;
645
10.4k
        compl.imag = PyOS_string_to_double(s, (char **)&end, NULL);
646
10.4k
        if (compl.imag == -1.0 && PyErr_Occurred()) {
647
1
            return NULL;
648
1
        }
649
10.4k
        return PyComplex_FromCComplex(compl);
650
10.4k
    }
651
51.0k
    dx = PyOS_string_to_double(s, NULL, NULL);
652
51.0k
    if (dx == -1.0 && PyErr_Occurred()) {
653
14
        return NULL;
654
14
    }
655
51.0k
    return PyFloat_FromDouble(dx);
656
51.0k
}
657
658
static PyObject *
659
parsenumber(const char *s)
660
257k
{
661
257k
    char *dup;
662
257k
    char *end;
663
257k
    PyObject *res = NULL;
664
665
257k
    assert(s != NULL);
666
667
257k
    if (strchr(s, '_') == NULL) {
668
256k
        return parsenumber_raw(s);
669
256k
    }
670
    /* Create a duplicate without underscores. */
671
1.10k
    dup = PyMem_Malloc(strlen(s) + 1);
672
1.10k
    if (dup == NULL) {
673
0
        return PyErr_NoMemory();
674
0
    }
675
1.10k
    end = dup;
676
17.7k
    for (; *s; s++) {
677
16.6k
        if (*s != '_') {
678
13.4k
            *end++ = *s;
679
13.4k
        }
680
16.6k
    }
681
1.10k
    *end = '\0';
682
1.10k
    res = parsenumber_raw(dup);
683
1.10k
    PyMem_Free(dup);
684
1.10k
    return res;
685
1.10k
}
686
687
expr_ty
688
_PyPegen_number_token(Parser *p)
689
909k
{
690
909k
    Token *t = _PyPegen_expect_token(p, NUMBER);
691
909k
    if (t == NULL) {
692
652k
        return NULL;
693
652k
    }
694
695
257k
    const char *num_raw = PyBytes_AsString(t->bytes);
696
257k
    if (num_raw == NULL) {
697
0
        p->error_indicator = 1;
698
0
        return NULL;
699
0
    }
700
701
257k
    if (p->feature_version < 6 && strchr(num_raw, '_') != NULL) {
702
0
        p->error_indicator = 1;
703
0
        return RAISE_SYNTAX_ERROR("Underscores in numeric literals are only supported "
704
0
                                  "in Python 3.6 and greater");
705
0
    }
706
707
257k
    PyObject *c = parsenumber(num_raw);
708
709
257k
    if (c == NULL) {
710
15
        p->error_indicator = 1;
711
15
        PyThreadState *tstate = _PyThreadState_GET();
712
        // The only way a ValueError should happen in _this_ code is via
713
        // PyLong_FromString hitting a length limit.
714
15
        if (tstate->current_exception != NULL &&
715
15
            Py_TYPE(tstate->current_exception) == (PyTypeObject *)PyExc_ValueError
716
15
        ) {
717
15
            PyObject *exc = PyErr_GetRaisedException();
718
            /* Intentionally omitting columns to avoid a wall of 1000s of '^'s
719
             * on the error message. Nobody is going to overlook their huge
720
             * numeric literal once given the line. */
721
15
            RAISE_ERROR_KNOWN_LOCATION(
722
15
                p, PyExc_SyntaxError,
723
15
                t->lineno, -1 /* col_offset */,
724
15
                t->end_lineno, -1 /* end_col_offset */,
725
15
                "%S - Consider hexadecimal for huge integer literals "
726
15
                "to avoid decimal conversion limits.",
727
15
                exc);
728
15
            Py_DECREF(exc);
729
15
        }
730
15
        return NULL;
731
15
    }
732
733
257k
    if (_PyArena_AddPyObject(p->arena, c) < 0) {
734
0
        Py_DECREF(c);
735
0
        p->error_indicator = 1;
736
0
        return NULL;
737
0
    }
738
739
257k
    return _PyAST_Constant(c, NULL, t->lineno, t->col_offset, t->end_lineno,
740
257k
                           t->end_col_offset, p->arena);
741
257k
}
742
743
/* Check that the source for a single input statement really is a single
744
   statement by looking at what is left in the buffer after parsing.
745
   Trailing whitespace and comments are OK. */
746
static int // bool
747
bad_single_statement(Parser *p)
748
0
{
749
0
    char *cur = p->tok->cur;
750
0
    char c = *cur;
751
752
0
    for (;;) {
753
0
        while (c == ' ' || c == '\t' || c == '\n' || c == '\014') {
754
0
            c = *++cur;
755
0
        }
756
757
0
        if (!c) {
758
0
            return 0;
759
0
        }
760
761
0
        if (c != '#') {
762
0
            return 1;
763
0
        }
764
765
        /* Suck up comment. */
766
0
        while (c && c != '\n') {
767
0
            c = *++cur;
768
0
        }
769
0
    }
770
0
}
771
772
static int
773
compute_parser_flags(PyCompilerFlags *flags)
774
21.3k
{
775
21.3k
    int parser_flags = 0;
776
21.3k
    if (!flags) {
777
48
        return 0;
778
48
    }
779
21.3k
    if (flags->cf_flags & PyCF_DONT_IMPLY_DEDENT) {
780
0
        parser_flags |= PyPARSE_DONT_IMPLY_DEDENT;
781
0
    }
782
21.3k
    if (flags->cf_flags & PyCF_IGNORE_COOKIE) {
783
78
        parser_flags |= PyPARSE_IGNORE_COOKIE;
784
78
    }
785
21.3k
    if (flags->cf_flags & CO_FUTURE_BARRY_AS_BDFL) {
786
0
        parser_flags |= PyPARSE_BARRY_AS_BDFL;
787
0
    }
788
21.3k
    if (flags->cf_flags & PyCF_TYPE_COMMENTS) {
789
0
        parser_flags |= PyPARSE_TYPE_COMMENTS;
790
0
    }
791
21.3k
    if (flags->cf_flags & PyCF_ALLOW_INCOMPLETE_INPUT) {
792
0
        parser_flags |= PyPARSE_ALLOW_INCOMPLETE_INPUT;
793
0
    }
794
21.3k
    return parser_flags;
795
21.3k
}
796
797
// Parser API
798
799
Parser *
800
_PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
801
                    int feature_version, int *errcode, const char* source, PyArena *arena)
802
21.3k
{
803
21.3k
    Parser *p = PyMem_Malloc(sizeof(Parser));
804
21.3k
    if (p == NULL) {
805
0
        return (Parser *) PyErr_NoMemory();
806
0
    }
807
21.3k
    assert(tok != NULL);
808
21.3k
    tok->type_comments = (flags & PyPARSE_TYPE_COMMENTS) > 0;
809
21.3k
    p->tok = tok;
810
21.3k
    p->keywords = NULL;
811
21.3k
    p->n_keyword_lists = -1;
812
21.3k
    p->soft_keywords = NULL;
813
21.3k
    p->tokens = PyMem_Malloc(sizeof(Token *));
814
21.3k
    if (!p->tokens) {
815
0
        PyMem_Free(p);
816
0
        return (Parser *) PyErr_NoMemory();
817
0
    }
818
21.3k
    p->tokens[0] = PyMem_Calloc(1, sizeof(Token));
819
21.3k
    if (!p->tokens[0]) {
820
0
        PyMem_Free(p->tokens);
821
0
        PyMem_Free(p);
822
0
        return (Parser *) PyErr_NoMemory();
823
0
    }
824
21.3k
    if (!growable_comment_array_init(&p->type_ignore_comments, 10)) {
825
0
        PyMem_Free(p->tokens[0]);
826
0
        PyMem_Free(p->tokens);
827
0
        PyMem_Free(p);
828
0
        return (Parser *) PyErr_NoMemory();
829
0
    }
830
831
21.3k
    p->mark = 0;
832
21.3k
    p->fill = 0;
833
21.3k
    p->size = 1;
834
835
21.3k
    p->errcode = errcode;
836
21.3k
    p->arena = arena;
837
21.3k
    p->start_rule = start_rule;
838
21.3k
    p->parsing_started = 0;
839
21.3k
    p->normalize = NULL;
840
21.3k
    p->error_indicator = 0;
841
842
21.3k
    p->starting_lineno = 0;
843
21.3k
    p->starting_col_offset = 0;
844
21.3k
    p->flags = flags;
845
21.3k
    p->feature_version = feature_version;
846
21.3k
    p->known_err_token = NULL;
847
21.3k
    p->level = 0;
848
21.3k
    p->call_invalid_rules = 0;
849
21.3k
    p->last_stmt_location.lineno = 0;
850
21.3k
    p->last_stmt_location.col_offset = 0;
851
21.3k
    p->last_stmt_location.end_lineno = 0;
852
21.3k
    p->last_stmt_location.end_col_offset = 0;
853
#ifdef Py_DEBUG
854
    p->debug = _Py_GetConfig()->parser_debug;
855
#endif
856
21.3k
    return p;
857
21.3k
}
858
859
void
860
_PyPegen_Parser_Free(Parser *p)
861
21.3k
{
862
21.3k
    Py_XDECREF(p->normalize);
863
2.60M
    for (int i = 0; i < p->size; i++) {
864
2.58M
        PyMem_Free(p->tokens[i]);
865
2.58M
    }
866
21.3k
    PyMem_Free(p->tokens);
867
21.3k
    growable_comment_array_deallocate(&p->type_ignore_comments);
868
21.3k
    PyMem_Free(p);
869
21.3k
}
870
871
static void
872
reset_parser_state_for_error_pass(Parser *p)
873
13.6k
{
874
13.6k
    p->last_stmt_location.lineno = 0;
875
13.6k
    p->last_stmt_location.col_offset = 0;
876
13.6k
    p->last_stmt_location.end_lineno = 0;
877
13.6k
    p->last_stmt_location.end_col_offset = 0;
878
512k
    for (int i = 0; i < p->fill; i++) {
879
498k
        p->tokens[i]->memo = NULL;
880
498k
    }
881
13.6k
    p->mark = 0;
882
13.6k
    p->call_invalid_rules = 1;
883
    // Don't try to get extra tokens in interactive mode when trying to
884
    // raise specialized errors in the second pass.
885
13.6k
    p->tok->interactive_underflow = IUNDERFLOW_STOP;
886
13.6k
}
887
888
static inline int
889
0
_is_end_of_source(Parser *p) {
890
0
    int err = p->tok->done;
891
0
    return err == E_EOF || err == E_EOFS || err == E_EOLS;
892
0
}
893
894
static void
895
13.3k
_PyPegen_set_syntax_error_metadata(Parser *p) {
896
13.3k
    PyObject *exc = PyErr_GetRaisedException();
897
13.3k
    if (!exc || !PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_SyntaxError)) {
898
0
        PyErr_SetRaisedException(exc);
899
0
        return;
900
0
    }
901
13.3k
    const char *source = NULL;
902
13.3k
    if (p->tok->str != NULL) {
903
13.3k
        source = p->tok->str;
904
13.3k
    }
905
13.3k
    if (!source && p->tok->fp_interactive && p->tok->interactive_src_start) {
906
0
        source = p->tok->interactive_src_start;
907
0
    }
908
13.3k
    PyObject* the_source = NULL;
909
13.3k
    if (source) {
910
13.3k
        if (p->tok->encoding == NULL) {
911
11.8k
            the_source = PyUnicode_FromString(source);
912
11.8k
        } else {
913
1.50k
            the_source = PyUnicode_Decode(source, strlen(source), p->tok->encoding, NULL);
914
1.50k
        }
915
13.3k
    }
916
13.3k
    if (!the_source) {
917
1.49k
        PyErr_Clear();
918
1.49k
        the_source = Py_None;
919
1.49k
        Py_INCREF(the_source);
920
1.49k
    }
921
13.3k
    PyObject* metadata = Py_BuildValue(
922
13.3k
        "(iiN)",
923
13.3k
        p->last_stmt_location.lineno,
924
13.3k
        p->last_stmt_location.col_offset,
925
13.3k
        the_source // N gives ownership to metadata
926
13.3k
    );
927
13.3k
    if (!metadata) {
928
0
        Py_DECREF(the_source);
929
0
        PyErr_Clear();
930
0
        return;
931
0
    }
932
13.3k
    PySyntaxErrorObject *syntax_error = (PySyntaxErrorObject *)exc;
933
934
13.3k
    Py_XDECREF(syntax_error->metadata);
935
13.3k
    syntax_error->metadata = metadata;
936
13.3k
    PyErr_SetRaisedException(exc);
937
13.3k
}
938
939
// Run the PEG parser to completion and return the parse result (an AST
// node allocated in the parser's arena), or NULL with an exception set.
//
// On failure, a second parsing pass is made with the slow "invalid_*"
// grammar rules enabled purely to produce a better SyntaxError; the
// second pass's parse result is discarded.
void *
_PyPegen_run_parser(Parser *p)
{
    void *res = _PyPegen_parse(p);
    assert(p->level == 0);
    if (res == NULL) {
        // With PyPARSE_ALLOW_INCOMPLETE_INPUT (REPL/codeop use), report a
        // plain end-of-input as IncompleteInputError rather than SyntaxError.
        if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) &&  _is_end_of_source(p)) {
            PyErr_Clear();
            return _PyPegen_raise_error(p, PyExc_IncompleteInputError, 0, "incomplete input");
        }
        // A non-SyntaxError (e.g. MemoryError, KeyboardInterrupt) must
        // propagate unchanged -- no diagnostic second pass for those.
        if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            return NULL;
        }
        // Make a second parser pass. In this pass we activate heavier and slower checks
        // to produce better error messages and more complete diagnostics. Extra "invalid_*"
        // rules will be active during parsing.
        Token *last_token = p->tokens[p->fill - 1];
        reset_parser_state_for_error_pass(p);
        _PyPegen_parse(p);

        // Set SyntaxErrors accordingly depending on the parser/tokenizer status at the failure
        // point.
        _Pypegen_set_syntax_error(p, last_token);

        // Set the metadata in the exception from p->last_stmt_location
        if (PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            _PyPegen_set_syntax_error_metadata(p);
        }
        return NULL;
    }

    // "single" (interactive) mode accepts exactly one statement.
    if (p->start_rule == Py_single_input && bad_single_statement(p)) {
        p->tok->done = E_BADSINGLE; // This is not necessary for now, but might be in the future
        return RAISE_SYNTAX_ERROR("multiple statements found while compiling a single statement");
    }

    // test_peg_generator defines _Py_TEST_PEGEN to not call PyAST_Validate()
#if defined(Py_DEBUG) && !defined(_Py_TEST_PEGEN)
    if (p->start_rule == Py_single_input ||
        p->start_rule == Py_file_input ||
        p->start_rule == Py_eval_input)
    {
        if (!_PyAST_Validate(res)) {
            return NULL;
        }
    }
#endif
    return res;
}
988
989
mod_ty
990
_PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filename_ob,
991
                             const char *enc, const char *ps1, const char *ps2,
992
                             PyCompilerFlags *flags, int *errcode,
993
                             PyObject **interactive_src, PyArena *arena)
994
0
{
995
0
    struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
996
0
    if (tok == NULL) {
997
0
        if (PyErr_Occurred()) {
998
0
            _PyPegen_raise_tokenizer_init_error(filename_ob);
999
0
            return NULL;
1000
0
        }
1001
0
        return NULL;
1002
0
    }
1003
0
    if (!tok->fp || ps1 != NULL || ps2 != NULL ||
1004
0
        PyUnicode_CompareWithASCIIString(filename_ob, "<stdin>") == 0) {
1005
0
        tok->fp_interactive = 1;
1006
0
    }
1007
    // This transfers the ownership to the tokenizer
1008
0
    tok->filename = Py_NewRef(filename_ob);
1009
1010
    // From here on we need to clean up even if there's an error
1011
0
    mod_ty result = NULL;
1012
1013
0
    int parser_flags = compute_parser_flags(flags);
1014
0
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, PY_MINOR_VERSION,
1015
0
                                    errcode, NULL, arena);
1016
0
    if (p == NULL) {
1017
0
        goto error;
1018
0
    }
1019
1020
0
    result = _PyPegen_run_parser(p);
1021
0
    _PyPegen_Parser_Free(p);
1022
1023
0
    if (tok->fp_interactive && tok->interactive_src_start && result && interactive_src != NULL) {
1024
0
        *interactive_src = PyUnicode_FromString(tok->interactive_src_start);
1025
0
        if (!interactive_src || _PyArena_AddPyObject(arena, *interactive_src) < 0) {
1026
0
            Py_XDECREF(interactive_src);
1027
0
            result = NULL;
1028
0
            goto error;
1029
0
        }
1030
0
    }
1031
1032
0
error:
1033
0
    _PyTokenizer_Free(tok);
1034
0
    return result;
1035
0
}
1036
1037
mod_ty
1038
_PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filename_ob,
1039
                       PyCompilerFlags *flags, PyArena *arena)
1040
23.1k
{
1041
23.1k
    int exec_input = start_rule == Py_file_input;
1042
1043
23.1k
    struct tok_state *tok;
1044
23.1k
    if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
1045
78
        tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
1046
23.0k
    } else {
1047
23.0k
        tok = _PyTokenizer_FromString(str, exec_input, 0);
1048
23.0k
    }
1049
23.1k
    if (tok == NULL) {
1050
1.79k
        if (PyErr_Occurred()) {
1051
1.79k
            _PyPegen_raise_tokenizer_init_error(filename_ob);
1052
1.79k
        }
1053
1.79k
        return NULL;
1054
1.79k
    }
1055
    // This transfers the ownership to the tokenizer
1056
21.3k
    tok->filename = Py_NewRef(filename_ob);
1057
1058
    // We need to clear up from here on
1059
21.3k
    mod_ty result = NULL;
1060
1061
21.3k
    int parser_flags = compute_parser_flags(flags);
1062
21.3k
    int feature_version = flags && (flags->cf_flags & PyCF_ONLY_AST) ?
1063
20.9k
        flags->cf_feature_version : PY_MINOR_VERSION;
1064
21.3k
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, feature_version,
1065
21.3k
                                    NULL, str, arena);
1066
21.3k
    if (p == NULL) {
1067
0
        goto error;
1068
0
    }
1069
1070
21.3k
    result = _PyPegen_run_parser(p);
1071
21.3k
    _PyPegen_Parser_Free(p);
1072
1073
21.3k
error:
1074
21.3k
    _PyTokenizer_Free(tok);
1075
21.3k
    return result;
1076
21.3k
}