Coverage Report

Created: 2025-07-11 06:59

/src/Python-3.8.3/Parser/tokenizer.c
Line | Count | Source
1
2
/* Tokenizer implementation */
3
4
#define PY_SSIZE_T_CLEAN
5
#include "Python.h"
6
7
#include <ctype.h>
8
#include <assert.h>
9
10
#include "tokenizer.h"
11
#include "errcode.h"
12
13
#include "unicodeobject.h"
14
#include "bytesobject.h"
15
#include "fileobject.h"
16
#include "codecs.h"
17
#include "abstract.h"
18
19
/* Alternate tab spacing */
20
0
#define ALTTABSIZE 1
21
22
1.35k
#define is_potential_identifier_start(c) (\
23
1.35k
              (c >= 'a' && c <= 'z')\
24
1.35k
               || (c >= 'A' && c <= 'Z')\
25
1.35k
               || c == '_'\
26
1.35k
               || (c >= 128))
27
28
3.04k
#define is_potential_identifier_char(c) (\
29
3.04k
              (c >= 'a' && c <= 'z')\
30
3.04k
               || (c >= 'A' && c <= 'Z')\
31
3.04k
               || (c >= '0' && c <= '9')\
32
3.04k
               || c == '_'\
33
3.04k
               || (c >= 128))
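
A small illustrative sketch (not part of tokenizer.c; example_classification is a hypothetical name) of how the two macros above classify characters: any byte >= 128 is accepted provisionally, and non-ASCII names are only validated against PEP 3131 later, in verify_identifier().

static int example_classification(void)
{
    int ok = 1;
    ok &= is_potential_identifier_start('_');    /* underscores may start a name   */
    ok &= !is_potential_identifier_start('1');   /* digits may not start a name... */
    ok &= is_potential_identifier_char('1');     /* ...but may continue one        */
    ok &= is_potential_identifier_start(0xCF);   /* lead byte of UTF-8 for U+03C0  */
    return ok;                                   /* 1 if all of the above hold     */
}
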
34
35
extern char *PyOS_Readline(FILE *, FILE *, const char *);
36
/* Return malloc'ed string including trailing \n;
37
   empty malloc'ed string for EOF;
38
   NULL if interrupted */
39
40
/* Don't ever change this -- it would break the portability of Python code */
41
16
#define TABSIZE 8
42
43
/* Forward */
44
static struct tok_state *tok_new(void);
45
static int tok_nextc(struct tok_state *tok);
46
static void tok_backup(struct tok_state *tok, int c);
47
48
49
/* Spaces in this constant are treated as "zero or more spaces or tabs" when
50
   tokenizing. */
51
static const char* type_comment_prefix = "# type: ";
52
53
/* Create and initialize a new tok_state structure */
54
55
static struct tok_state *
56
tok_new(void)
57
16
{
58
16
    struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
59
16
                                            sizeof(struct tok_state));
60
16
    if (tok == NULL)
61
0
        return NULL;
62
16
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
63
16
    tok->done = E_OK;
64
16
    tok->fp = NULL;
65
16
    tok->input = NULL;
66
16
    tok->tabsize = TABSIZE;
67
16
    tok->indent = 0;
68
16
    tok->indstack[0] = 0;
69
70
16
    tok->atbol = 1;
71
16
    tok->pendin = 0;
72
16
    tok->prompt = tok->nextprompt = NULL;
73
16
    tok->lineno = 0;
74
16
    tok->level = 0;
75
16
    tok->altindstack[0] = 0;
76
16
    tok->decoding_state = STATE_INIT;
77
16
    tok->decoding_erred = 0;
78
16
    tok->read_coding_spec = 0;
79
16
    tok->enc = NULL;
80
16
    tok->encoding = NULL;
81
16
    tok->cont_line = 0;
82
16
    tok->filename = NULL;
83
16
    tok->decoding_readline = NULL;
84
16
    tok->decoding_buffer = NULL;
85
16
    tok->type_comments = 0;
86
87
16
    tok->async_hacks = 0;
88
16
    tok->async_def = 0;
89
16
    tok->async_def_indent = 0;
90
16
    tok->async_def_nl = 0;
91
92
16
    return tok;
93
16
}
94
95
static char *
96
new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
97
0
{
98
0
    char* result = (char *)PyMem_MALLOC(len + 1);
99
0
    if (!result) {
100
0
        tok->done = E_NOMEM;
101
0
        return NULL;
102
0
    }
103
0
    memcpy(result, s, len);
104
0
    result[len] = '\0';
105
0
    return result;
106
0
}
107
108
static char *
109
error_ret(struct tok_state *tok) /* XXX */
110
0
{
111
0
    tok->decoding_erred = 1;
112
0
    if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
113
0
        PyMem_FREE(tok->buf);
114
0
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
115
0
    tok->done = E_DECODE;
116
0
    return NULL;                /* as if it were EOF */
117
0
}
118
119
120
static const char *
121
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
122
0
{
123
0
    char buf[13];
124
0
    int i;
125
0
    for (i = 0; i < 12; i++) {
126
0
        int c = s[i];
127
0
        if (c == '\0')
128
0
            break;
129
0
        else if (c == '_')
130
0
            buf[i] = '-';
131
0
        else
132
0
            buf[i] = tolower(c);
133
0
    }
134
0
    buf[i] = '\0';
135
0
    if (strcmp(buf, "utf-8") == 0 ||
136
0
        strncmp(buf, "utf-8-", 6) == 0)
137
0
        return "utf-8";
138
0
    else if (strcmp(buf, "latin-1") == 0 ||
139
0
             strcmp(buf, "iso-8859-1") == 0 ||
140
0
             strcmp(buf, "iso-latin-1") == 0 ||
141
0
             strncmp(buf, "latin-1-", 8) == 0 ||
142
0
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
143
0
             strncmp(buf, "iso-latin-1-", 12) == 0)
144
0
        return "iso-8859-1";
145
0
    else
146
0
        return s;
147
0
}
148
149
/* Return the coding spec in S, or NULL if none is found.  */
150
151
static int
152
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
153
28
{
154
28
    Py_ssize_t i;
155
28
    *spec = NULL;
156
    /* Coding spec must be in a comment, and that comment must be
157
     * the only statement on the source code line. */
158
28
    for (i = 0; i < size - 6; i++) {
159
4
        if (s[i] == '#')
160
0
            break;
161
4
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
162
4
            return 1;
163
4
    }
164
24
    for (; i < size - 6; i++) { /* XXX inefficient search */
165
0
        const char* t = s + i;
166
0
        if (strncmp(t, "coding", 6) == 0) {
167
0
            const char* begin = NULL;
168
0
            t += 6;
169
0
            if (t[0] != ':' && t[0] != '=')
170
0
                continue;
171
0
            do {
172
0
                t++;
173
0
            } while (t[0] == '\x20' || t[0] == '\t');
174
175
0
            begin = t;
176
0
            while (Py_ISALNUM(t[0]) ||
177
0
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
178
0
                t++;
179
180
0
            if (begin < t) {
181
0
                char* r = new_string(begin, t - begin, tok);
182
0
                const char* q;
183
0
                if (!r)
184
0
                    return 0;
185
0
                q = get_normal_name(r);
186
0
                if (r != q) {
187
0
                    PyMem_FREE(r);
188
0
                    r = new_string(q, strlen(q), tok);
189
0
                    if (!r)
190
0
                        return 0;
191
0
                }
192
0
                *spec = r;
193
0
                break;
194
0
            }
195
0
        }
196
0
    }
197
24
    return 1;
198
24
}
199
200
/* Check whether the line contains a coding spec. If it does,
201
   invoke the set_readline function for the new encoding.
202
   This function receives the tok_state and the new encoding.
203
   Return 1 on success, 0 on failure.  */
204
205
static int
206
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
207
                  int set_readline(struct tok_state *, const char *))
208
28
{
209
28
    char *cs;
210
28
    int r = 1;
211
212
28
    if (tok->cont_line) {
213
        /* It's a continuation line, so it can't be a coding spec. */
214
0
        tok->read_coding_spec = 1;
215
0
        return 1;
216
0
    }
217
28
    if (!get_coding_spec(line, &cs, size, tok))
218
0
        return 0;
219
28
    if (!cs) {
220
28
        Py_ssize_t i;
221
28
        for (i = 0; i < size; i++) {
222
14
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
223
0
                break;
224
14
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
225
                /* Stop checking coding spec after a line containing
226
                 * anything except a comment. */
227
14
                tok->read_coding_spec = 1;
228
14
                break;
229
14
            }
230
14
        }
231
28
        return 1;
232
28
    }
233
0
    tok->read_coding_spec = 1;
234
0
    if (tok->encoding == NULL) {
235
0
        assert(tok->decoding_state == STATE_RAW);
236
0
        if (strcmp(cs, "utf-8") == 0) {
237
0
            tok->encoding = cs;
238
0
        } else {
239
0
            r = set_readline(tok, cs);
240
0
            if (r) {
241
0
                tok->encoding = cs;
242
0
                tok->decoding_state = STATE_NORMAL;
243
0
            }
244
0
            else {
245
0
                PyErr_Format(PyExc_SyntaxError,
246
0
                             "encoding problem: %s", cs);
247
0
                PyMem_FREE(cs);
248
0
            }
249
0
        }
250
0
    } else {                /* then, compare cs with BOM */
251
0
        r = (strcmp(tok->encoding, cs) == 0);
252
0
        if (!r)
253
0
            PyErr_Format(PyExc_SyntaxError,
254
0
                         "encoding problem: %s with BOM", cs);
255
0
        PyMem_FREE(cs);
256
0
    }
257
0
    return r;
258
28
}
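
For reference, a minimal sketch (not part of tokenizer.c; the variable names are invented) of the two PEP 263 cookie styles that the get_coding_spec()/check_coding_spec() pair above accepts on line 1 or 2: each is a comment-only line containing "coding" followed by ':' or '=', and get_normal_name() canonicalizes both extracted spellings to "iso-8859-1".

static const char coding_cookie_emacs[] = "# -*- coding: latin-1 -*-\n";
static const char coding_cookie_vim[]   = "# vim: set fileencoding=iso-8859-1 :\n";
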
259
260
/* See whether the file starts with a BOM. If it does,
261
   invoke the set_readline function with the new encoding.
262
   Return 1 on success, 0 on failure.  */
263
264
static int
265
check_bom(int get_char(struct tok_state *),
266
          void unget_char(int, struct tok_state *),
267
          int set_readline(struct tok_state *, const char *),
268
          struct tok_state *tok)
269
14
{
270
14
    int ch1, ch2, ch3;
271
14
    ch1 = get_char(tok);
272
14
    tok->decoding_state = STATE_RAW;
273
14
    if (ch1 == EOF) {
274
0
        return 1;
275
14
    } else if (ch1 == 0xEF) {
276
0
        ch2 = get_char(tok);
277
0
        if (ch2 != 0xBB) {
278
0
            unget_char(ch2, tok);
279
0
            unget_char(ch1, tok);
280
0
            return 1;
281
0
        }
282
0
        ch3 = get_char(tok);
283
0
        if (ch3 != 0xBF) {
284
0
            unget_char(ch3, tok);
285
0
            unget_char(ch2, tok);
286
0
            unget_char(ch1, tok);
287
0
            return 1;
288
0
        }
289
#if 0
290
    /* Disable support for UTF-16 BOMs until a decision
291
       is made whether this needs to be supported.  */
292
    } else if (ch1 == 0xFE) {
293
        ch2 = get_char(tok);
294
        if (ch2 != 0xFF) {
295
            unget_char(ch2, tok);
296
            unget_char(ch1, tok);
297
            return 1;
298
        }
299
        if (!set_readline(tok, "utf-16-be"))
300
            return 0;
301
        tok->decoding_state = STATE_NORMAL;
302
    } else if (ch1 == 0xFF) {
303
        ch2 = get_char(tok);
304
        if (ch2 != 0xFE) {
305
            unget_char(ch2, tok);
306
            unget_char(ch1, tok);
307
            return 1;
308
        }
309
        if (!set_readline(tok, "utf-16-le"))
310
            return 0;
311
        tok->decoding_state = STATE_NORMAL;
312
#endif
313
14
    } else {
314
14
        unget_char(ch1, tok);
315
14
        return 1;
316
14
    }
317
0
    if (tok->encoding != NULL)
318
0
        PyMem_FREE(tok->encoding);
319
0
    tok->encoding = new_string("utf-8", 5, tok);
320
0
    if (!tok->encoding)
321
0
        return 0;
322
    /* No need to set_readline: input is already utf-8 */
323
0
    return 1;
324
0
}
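
The only byte-order mark handled above is UTF-8's EF BB BF (the UTF-16 branches are compiled out under #if 0); a small illustrative constant, not part of tokenizer.c:

static const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
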
325
326
/* Read a line of text from TOK into S, using the stream in TOK.
327
   Return NULL on failure, else S.
328
329
   On entry, tok->decoding_buffer will be one of:
330
     1) NULL: need to call tok->decoding_readline to get a new line
331
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
332
       stored the result in tok->decoding_buffer
333
     3) PyByteArrayObject *: previous call to fp_readl did not have enough room
334
       (in the s buffer) to copy entire contents of the line read
335
       by tok->decoding_readline.  tok->decoding_buffer has the overflow.
336
       In this case, fp_readl is called in a loop (with an expanded buffer)
337
       until the buffer ends with a '\n' (or until the end of the file is
338
       reached): see tok_nextc and its calls to decoding_fgets.
339
*/
340
341
static char *
342
fp_readl(char *s, int size, struct tok_state *tok)
343
0
{
344
0
    PyObject* bufobj;
345
0
    const char *buf;
346
0
    Py_ssize_t buflen;
347
348
    /* Ask for one less byte so we can terminate it */
349
0
    assert(size > 0);
350
0
    size--;
351
352
0
    if (tok->decoding_buffer) {
353
0
        bufobj = tok->decoding_buffer;
354
0
        Py_INCREF(bufobj);
355
0
    }
356
0
    else
357
0
    {
358
0
        bufobj = _PyObject_CallNoArg(tok->decoding_readline);
359
0
        if (bufobj == NULL)
360
0
            goto error;
361
0
    }
362
0
    if (PyUnicode_CheckExact(bufobj))
363
0
    {
364
0
        buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
365
0
        if (buf == NULL) {
366
0
            goto error;
367
0
        }
368
0
    }
369
0
    else
370
0
    {
371
0
        buf = PyByteArray_AsString(bufobj);
372
0
        if (buf == NULL) {
373
0
            goto error;
374
0
        }
375
0
        buflen = PyByteArray_GET_SIZE(bufobj);
376
0
    }
377
378
0
    Py_XDECREF(tok->decoding_buffer);
379
0
    if (buflen > size) {
380
        /* Too many chars, the rest goes into tok->decoding_buffer */
381
0
        tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
382
0
                                                         buflen-size);
383
0
        if (tok->decoding_buffer == NULL)
384
0
            goto error;
385
0
        buflen = size;
386
0
    }
387
0
    else
388
0
        tok->decoding_buffer = NULL;
389
390
0
    memcpy(s, buf, buflen);
391
0
    s[buflen] = '\0';
392
0
    if (buflen == 0) /* EOF */
393
0
        s = NULL;
394
0
    Py_DECREF(bufobj);
395
0
    return s;
396
397
0
error:
398
0
    Py_XDECREF(bufobj);
399
0
    return error_ret(tok);
400
0
}
401
402
/* Set the readline function for TOK to a StreamReader's
403
   readline function. The StreamReader is named ENC.
404
405
   This function is called from check_bom and check_coding_spec.
406
407
   ENC is usually identical to the future value of tok->encoding,
408
   except for the (currently unsupported) case of UTF-16.
409
410
   Return 1 on success, 0 on failure. */
411
412
static int
413
fp_setreadl(struct tok_state *tok, const char* enc)
414
0
{
415
0
    PyObject *readline, *io, *stream;
416
0
    _Py_IDENTIFIER(open);
417
0
    _Py_IDENTIFIER(readline);
418
0
    int fd;
419
0
    long pos;
420
421
0
    fd = fileno(tok->fp);
422
    /* Due to buffering the file offset for fd can be different from the file
423
     * position of tok->fp.  If tok->fp was opened in text mode on Windows,
424
     * its file position counts CRLF as one char and can't be directly mapped
425
     * to the file offset for fd.  Instead we step back one byte and read to
426
     * the end of line.*/
427
0
    pos = ftell(tok->fp);
428
0
    if (pos == -1 ||
429
0
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
430
0
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
431
0
        return 0;
432
0
    }
433
434
0
    io = PyImport_ImportModuleNoBlock("io");
435
0
    if (io == NULL)
436
0
        return 0;
437
438
0
    stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
439
0
                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
440
0
    Py_DECREF(io);
441
0
    if (stream == NULL)
442
0
        return 0;
443
444
0
    readline = _PyObject_GetAttrId(stream, &PyId_readline);
445
0
    Py_DECREF(stream);
446
0
    if (readline == NULL)
447
0
        return 0;
448
0
    Py_XSETREF(tok->decoding_readline, readline);
449
450
0
    if (pos > 0) {
451
0
        PyObject *bufobj = _PyObject_CallNoArg(readline);
452
0
        if (bufobj == NULL)
453
0
            return 0;
454
0
        Py_DECREF(bufobj);
455
0
    }
456
457
0
    return 1;
458
0
}
459
460
/* Fetch the next byte from TOK. */
461
462
0
static int fp_getc(struct tok_state *tok) {
463
0
    return getc(tok->fp);
464
0
}
465
466
/* Unfetch the last byte back into TOK.  */
467
468
0
static void fp_ungetc(int c, struct tok_state *tok) {
469
0
    ungetc(c, tok->fp);
470
0
}
471
472
/* Check whether the characters at s start a valid
473
   UTF-8 sequence. Return the number of characters forming
474
   the sequence if yes, 0 if not.  */
475
static int valid_utf8(const unsigned char* s)
476
0
{
477
0
    int expected = 0;
478
0
    int length;
479
0
    if (*s < 0x80)
480
        /* single-byte code */
481
0
        return 1;
482
0
    if (*s < 0xc0)
483
        /* following byte */
484
0
        return 0;
485
0
    if (*s < 0xE0)
486
0
        expected = 1;
487
0
    else if (*s < 0xF0)
488
0
        expected = 2;
489
0
    else if (*s < 0xF8)
490
0
        expected = 3;
491
0
    else
492
0
        return 0;
493
0
    length = expected + 1;
494
0
    for (; expected; expected--)
495
0
        if (s[expected] < 0x80 || s[expected] >= 0xC0)
496
0
            return 0;
497
0
    return length;
498
0
}
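
A minimal sketch (illustrative only; valid_utf8() is static to this file, so the checker is passed in as a function pointer, and <assert.h> is already included at the top of the file) of the values the structural check above yields: it validates only the lead byte and the number of continuation bytes, not the decoded code point.

static void valid_utf8_examples(int (*check)(const unsigned char *))
{
    assert(check((const unsigned char *)"A") == 1);            /* ASCII, single byte        */
    assert(check((const unsigned char *)"\xC3\xA9") == 2);     /* U+00E9, two-byte sequence */
    assert(check((const unsigned char *)"\xE2\x82\xAC") == 3); /* U+20AC, three-byte form   */
    assert(check((const unsigned char *)"\xBF") == 0);         /* bare continuation byte    */
}
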
499
500
/* Read a line of input from TOK. Determine encoding
501
   if necessary.  */
502
503
static char *
504
decoding_fgets(char *s, int size, struct tok_state *tok)
505
0
{
506
0
    char *line = NULL;
507
0
    int badchar = 0;
508
0
    for (;;) {
509
0
        if (tok->decoding_state == STATE_NORMAL) {
510
            /* We already have a codec associated with
511
               this input. */
512
0
            line = fp_readl(s, size, tok);
513
0
            break;
514
0
        } else if (tok->decoding_state == STATE_RAW) {
515
            /* We want a 'raw' read. */
516
0
            line = Py_UniversalNewlineFgets(s, size,
517
0
                                            tok->fp, NULL);
518
0
            break;
519
0
        } else {
520
            /* We have not yet determined the encoding.
521
               If an encoding is found, use the file-pointer
522
               reader functions from now on. */
523
0
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
524
0
                return error_ret(tok);
525
0
            assert(tok->decoding_state != STATE_INIT);
526
0
        }
527
0
    }
528
0
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
529
0
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
530
0
            return error_ret(tok);
531
0
        }
532
0
    }
533
    /* The default encoding is UTF-8, so make sure we don't have any
534
       non-UTF-8 sequences in it. */
535
0
    if (line && !tok->encoding) {
536
0
        unsigned char *c;
537
0
        int length;
538
0
        for (c = (unsigned char *)line; *c; c += length)
539
0
            if (!(length = valid_utf8(c))) {
540
0
                badchar = *c;
541
0
                break;
542
0
            }
543
0
    }
544
0
    if (badchar) {
545
        /* Need to add 1 to the line number, since this line
546
           has not been counted, yet.  */
547
0
        PyErr_Format(PyExc_SyntaxError,
548
0
                "Non-UTF-8 code starting with '\\x%.2x' "
549
0
                "in file %U on line %i, "
550
0
                "but no encoding declared; "
551
0
                "see http://python.org/dev/peps/pep-0263/ for details",
552
0
                badchar, tok->filename, tok->lineno + 1);
553
0
        return error_ret(tok);
554
0
    }
555
0
    return line;
556
0
}
557
558
static int
559
decoding_feof(struct tok_state *tok)
560
0
{
561
0
    if (tok->decoding_state != STATE_NORMAL) {
562
0
        return feof(tok->fp);
563
0
    } else {
564
0
        PyObject* buf = tok->decoding_buffer;
565
0
        if (buf == NULL) {
566
0
            buf = _PyObject_CallNoArg(tok->decoding_readline);
567
0
            if (buf == NULL) {
568
0
                error_ret(tok);
569
0
                return 1;
570
0
            } else {
571
0
                tok->decoding_buffer = buf;
572
0
            }
573
0
        }
574
0
        return PyObject_Length(buf) == 0;
575
0
    }
576
0
}
577
578
/* Fetch a byte from TOK, using the string buffer. */
579
580
static int
581
14
buf_getc(struct tok_state *tok) {
582
14
    return Py_CHARMASK(*tok->str++);
583
14
}
584
585
/* Unfetch a byte from TOK, using the string buffer. */
586
587
static void
588
14
buf_ungetc(int c, struct tok_state *tok) {
589
14
    tok->str--;
590
14
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
591
14
}
592
593
/* Set the readline function for TOK to ENC. For the string-based
594
   tokenizer, this means to just record the encoding. */
595
596
static int
597
0
buf_setreadl(struct tok_state *tok, const char* enc) {
598
0
    tok->enc = enc;
599
0
    return 1;
600
0
}
601
602
/* Return a UTF-8 encoding Python string object from the
603
   C byte string STR, which is encoded with ENC. */
604
605
static PyObject *
606
0
translate_into_utf8(const char* str, const char* enc) {
607
0
    PyObject *utf8;
608
0
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
609
0
    if (buf == NULL)
610
0
        return NULL;
611
0
    utf8 = PyUnicode_AsUTF8String(buf);
612
0
    Py_DECREF(buf);
613
0
    return utf8;
614
0
}
615
616
617
static char *
618
16
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
619
16
    int skip_next_lf = 0;
620
16
    size_t needed_length = strlen(s) + 2, final_length;
621
16
    char *buf, *current;
622
16
    char c = '\0';
623
16
    buf = PyMem_MALLOC(needed_length);
624
16
    if (buf == NULL) {
625
0
        tok->done = E_NOMEM;
626
0
        return NULL;
627
0
    }
628
7.53k
    for (current = buf; *s; s++, current++) {
629
7.51k
        c = *s;
630
7.51k
        if (skip_next_lf) {
631
0
            skip_next_lf = 0;
632
0
            if (c == '\n') {
633
0
                c = *++s;
634
0
                if (!c)
635
0
                    break;
636
0
            }
637
0
        }
638
7.51k
        if (c == '\r') {
639
0
            skip_next_lf = 1;
640
0
            c = '\n';
641
0
        }
642
7.51k
        *current = c;
643
7.51k
    }
644
    /* If this is exec input, add a newline to the end of the string if
645
       there isn't one already. */
646
16
    if (exec_input && c != '\n') {
647
2
        *current = '\n';
648
2
        current++;
649
2
    }
650
16
    *current = '\0';
651
16
    final_length = current - buf + 1;
652
16
    if (final_length < needed_length && final_length) {
653
        /* should never fail */
654
14
        char* result = PyMem_REALLOC(buf, final_length);
655
14
        if (result == NULL) {
656
0
            PyMem_FREE(buf);
657
0
        }
658
14
        buf = result;
659
14
    }
660
16
    return buf;
661
16
}
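
An input/output sketch (not part of tokenizer.c; the names are invented) for translate_newlines() above: CR and CRLF are folded to LF, and with exec_input set a missing final newline is appended.

static const char nl_example_in[]  = "a = 1\r\nb = 2";   /* CRLF line ending, no final newline */
static const char nl_example_out[] = "a = 1\nb = 2\n";   /* result for exec_input == 1         */
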
662
663
/* Decode a byte string STR for use as the buffer of TOK.
664
   Look for encoding declarations inside STR, and record them
665
   inside TOK.  */
666
667
static const char *
668
decode_str(const char *input, int single, struct tok_state *tok)
669
14
{
670
14
    PyObject* utf8 = NULL;
671
14
    const char *str;
672
14
    const char *s;
673
14
    const char *newl[2] = {NULL, NULL};
674
14
    int lineno = 0;
675
14
    tok->input = str = translate_newlines(input, single, tok);
676
14
    if (str == NULL)
677
0
        return NULL;
678
14
    tok->enc = NULL;
679
14
    tok->str = str;
680
14
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
681
0
        return error_ret(tok);
682
14
    str = tok->str;             /* string after BOM if any */
683
14
    assert(str);
684
14
    if (tok->enc != NULL) {
685
0
        utf8 = translate_into_utf8(str, tok->enc);
686
0
        if (utf8 == NULL)
687
0
            return error_ret(tok);
688
0
        str = PyBytes_AsString(utf8);
689
0
    }
690
174
    for (s = str;; s++) {
691
174
        if (*s == '\0') break;
692
174
        else if (*s == '\n') {
693
28
            assert(lineno < 2);
694
28
            newl[lineno] = s;
695
28
            lineno++;
696
28
            if (lineno == 2) break;
697
28
        }
698
174
    }
699
14
    tok->enc = NULL;
700
    /* need to check line 1 and 2 separately since check_coding_spec
701
       assumes a single line as input */
702
14
    if (newl[0]) {
703
14
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
704
0
            return error_ret(tok);
705
14
        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
706
14
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
707
14
                                   tok, buf_setreadl))
708
0
                return error_ret(tok);
709
14
        }
710
14
    }
711
14
    if (tok->enc != NULL) {
712
0
        assert(utf8 == NULL);
713
0
        utf8 = translate_into_utf8(str, tok->enc);
714
0
        if (utf8 == NULL)
715
0
            return error_ret(tok);
716
0
        str = PyBytes_AS_STRING(utf8);
717
0
    }
718
14
    assert(tok->decoding_buffer == NULL);
719
14
    tok->decoding_buffer = utf8; /* CAUTION */
720
14
    return str;
721
14
}
722
723
/* Set up tokenizer for string */
724
725
struct tok_state *
726
PyTokenizer_FromString(const char *str, int exec_input)
727
14
{
728
14
    struct tok_state *tok = tok_new();
729
14
    if (tok == NULL)
730
0
        return NULL;
731
14
    str = decode_str(str, exec_input, tok);
732
14
    if (str == NULL) {
733
0
        PyTokenizer_Free(tok);
734
0
        return NULL;
735
0
    }
736
737
    /* XXX: constify members. */
738
14
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
739
14
    return tok;
740
14
}
741
742
struct tok_state *
743
PyTokenizer_FromUTF8(const char *str, int exec_input)
744
2
{
745
2
    struct tok_state *tok = tok_new();
746
2
    if (tok == NULL)
747
0
        return NULL;
748
2
    tok->input = str = translate_newlines(str, exec_input, tok);
749
2
    if (str == NULL) {
750
0
        PyTokenizer_Free(tok);
751
0
        return NULL;
752
0
    }
753
2
    tok->decoding_state = STATE_RAW;
754
2
    tok->read_coding_spec = 1;
755
2
    tok->enc = NULL;
756
2
    tok->str = str;
757
2
    tok->encoding = (char *)PyMem_MALLOC(6);
758
2
    if (!tok->encoding) {
759
0
        PyTokenizer_Free(tok);
760
0
        return NULL;
761
0
    }
762
2
    strcpy(tok->encoding, "utf-8");
763
764
    /* XXX: constify members. */
765
2
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
766
2
    return tok;
767
2
}
768
769
/* Set up tokenizer for file */
770
771
struct tok_state *
772
PyTokenizer_FromFile(FILE *fp, const char* enc,
773
                     const char *ps1, const char *ps2)
774
0
{
775
0
    struct tok_state *tok = tok_new();
776
0
    if (tok == NULL)
777
0
        return NULL;
778
0
    if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
779
0
        PyTokenizer_Free(tok);
780
0
        return NULL;
781
0
    }
782
0
    tok->cur = tok->inp = tok->buf;
783
0
    tok->end = tok->buf + BUFSIZ;
784
0
    tok->fp = fp;
785
0
    tok->prompt = ps1;
786
0
    tok->nextprompt = ps2;
787
0
    if (enc != NULL) {
788
        /* Must copy encoding declaration since it
789
           gets copied into the parse tree. */
790
0
        tok->encoding = PyMem_MALLOC(strlen(enc)+1);
791
0
        if (!tok->encoding) {
792
0
            PyTokenizer_Free(tok);
793
0
            return NULL;
794
0
        }
795
0
        strcpy(tok->encoding, enc);
796
0
        tok->decoding_state = STATE_NORMAL;
797
0
    }
798
0
    return tok;
799
0
}
800
801
802
/* Free a tok_state structure */
803
804
void
805
PyTokenizer_Free(struct tok_state *tok)
806
16
{
807
16
    if (tok->encoding != NULL)
808
0
        PyMem_FREE(tok->encoding);
809
16
    Py_XDECREF(tok->decoding_readline);
810
16
    Py_XDECREF(tok->decoding_buffer);
811
16
    Py_XDECREF(tok->filename);
812
16
    if (tok->fp != NULL && tok->buf != NULL)
813
0
        PyMem_FREE(tok->buf);
814
16
    if (tok->input)
815
16
        PyMem_FREE((char *)tok->input);
816
16
    PyMem_FREE(tok);
817
16
}
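
A minimal driver sketch for the set-up and tear-down functions above, assuming CPython 3.8's internal headers and an initialized interpreter; PyTokenizer_Get() is declared in Parser/tokenizer.h and wraps the tok_get() worker listed further down, and dump_tokens() is a hypothetical helper name.

#include "Python.h"
#include "tokenizer.h"
#include "token.h"

static void dump_tokens(const char *source)
{
    struct tok_state *tok = PyTokenizer_FromString(source, /*exec_input=*/1);
    if (tok == NULL) {
        return;
    }
    for (;;) {
        char *start = NULL, *end = NULL;
        int type = PyTokenizer_Get(tok, &start, &end);
        if (type == ENDMARKER || type == ERRORTOKEN) {
            break;
        }
        if (start != NULL && end != NULL) {
            printf("%3d %.*s\n", type, (int)(end - start), start);
        }
        else {
            printf("%3d\n", type);   /* INDENT/DEDENT carry no token text */
        }
    }
    PyTokenizer_Free(tok);
}
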
818
819
/* Get next char, updating state; error code goes into tok->done */
820
821
static int
822
tok_nextc(struct tok_state *tok)
823
10.3k
{
824
10.3k
    for (;;) {
825
10.3k
        if (tok->cur != tok->inp) {
826
9.98k
            return Py_CHARMASK(*tok->cur++); /* Fast path */
827
9.98k
        }
828
332
        if (tok->done != E_OK)
829
64
            return EOF;
830
268
        if (tok->fp == NULL) {
831
268
            char *end = strchr(tok->inp, '\n');
832
268
            if (end != NULL)
833
252
                end++;
834
16
            else {
835
16
                end = strchr(tok->inp, '\0');
836
16
                if (end == tok->inp) {
837
16
                    tok->done = E_EOF;
838
16
                    return EOF;
839
16
                }
840
16
            }
841
252
            if (tok->start == NULL)
842
252
                tok->buf = tok->cur;
843
252
            tok->line_start = tok->cur;
844
252
            tok->lineno++;
845
252
            tok->inp = end;
846
252
            return Py_CHARMASK(*tok->cur++);
847
268
        }
848
0
        if (tok->prompt != NULL) {
849
0
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
850
0
            if (newtok != NULL) {
851
0
                char *translated = translate_newlines(newtok, 0, tok);
852
0
                PyMem_FREE(newtok);
853
0
                if (translated == NULL)
854
0
                    return EOF;
855
0
                newtok = translated;
856
0
            }
857
0
            if (tok->encoding && newtok && *newtok) {
858
                /* Recode to UTF-8 */
859
0
                Py_ssize_t buflen;
860
0
                const char* buf;
861
0
                PyObject *u = translate_into_utf8(newtok, tok->encoding);
862
0
                PyMem_FREE(newtok);
863
0
                if (!u) {
864
0
                    tok->done = E_DECODE;
865
0
                    return EOF;
866
0
                }
867
0
                buflen = PyBytes_GET_SIZE(u);
868
0
                buf = PyBytes_AS_STRING(u);
869
0
                newtok = PyMem_MALLOC(buflen+1);
870
0
                if (newtok == NULL) {
871
0
                    Py_DECREF(u);
872
0
                    tok->done = E_NOMEM;
873
0
                    return EOF;
874
0
                }
875
0
                strcpy(newtok, buf);
876
0
                Py_DECREF(u);
877
0
            }
878
0
            if (tok->nextprompt != NULL)
879
0
                tok->prompt = tok->nextprompt;
880
0
            if (newtok == NULL)
881
0
                tok->done = E_INTR;
882
0
            else if (*newtok == '\0') {
883
0
                PyMem_FREE(newtok);
884
0
                tok->done = E_EOF;
885
0
            }
886
0
            else if (tok->start != NULL) {
887
0
                size_t start = tok->start - tok->buf;
888
0
                size_t oldlen = tok->cur - tok->buf;
889
0
                size_t newlen = oldlen + strlen(newtok);
890
0
                Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
891
0
                char *buf = tok->buf;
892
0
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
893
0
                tok->lineno++;
894
0
                if (buf == NULL) {
895
0
                    PyMem_FREE(tok->buf);
896
0
                    tok->buf = NULL;
897
0
                    PyMem_FREE(newtok);
898
0
                    tok->done = E_NOMEM;
899
0
                    return EOF;
900
0
                }
901
0
                tok->buf = buf;
902
0
                tok->cur = tok->buf + oldlen;
903
0
                tok->multi_line_start = tok->buf + cur_multi_line_start;
904
0
                tok->line_start = tok->cur;
905
0
                strcpy(tok->buf + oldlen, newtok);
906
0
                PyMem_FREE(newtok);
907
0
                tok->inp = tok->buf + newlen;
908
0
                tok->end = tok->inp + 1;
909
0
                tok->start = tok->buf + start;
910
0
            }
911
0
            else {
912
0
                tok->lineno++;
913
0
                if (tok->buf != NULL)
914
0
                    PyMem_FREE(tok->buf);
915
0
                tok->buf = newtok;
916
0
                tok->cur = tok->buf;
917
0
                tok->line_start = tok->buf;
918
0
                tok->inp = strchr(tok->buf, '\0');
919
0
                tok->end = tok->inp + 1;
920
0
            }
921
0
        }
922
0
        else {
923
0
            int done = 0;
924
0
            Py_ssize_t cur = 0;
925
0
            char *pt;
926
0
            if (tok->start == NULL) {
927
0
                if (tok->buf == NULL) {
928
0
                    tok->buf = (char *)
929
0
                        PyMem_MALLOC(BUFSIZ);
930
0
                    if (tok->buf == NULL) {
931
0
                        tok->done = E_NOMEM;
932
0
                        return EOF;
933
0
                    }
934
0
                    tok->end = tok->buf + BUFSIZ;
935
0
                }
936
0
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
937
0
                          tok) == NULL) {
938
0
                    if (!tok->decoding_erred)
939
0
                        tok->done = E_EOF;
940
0
                    done = 1;
941
0
                }
942
0
                else {
943
0
                    tok->done = E_OK;
944
0
                    tok->inp = strchr(tok->buf, '\0');
945
0
                    done = tok->inp == tok->buf || tok->inp[-1] == '\n';
946
0
                }
947
0
            }
948
0
            else {
949
0
                cur = tok->cur - tok->buf;
950
0
                if (decoding_feof(tok)) {
951
0
                    tok->done = E_EOF;
952
0
                    done = 1;
953
0
                }
954
0
                else
955
0
                    tok->done = E_OK;
956
0
            }
957
0
            tok->lineno++;
958
            /* Read until '\n' or EOF */
959
0
            while (!done) {
960
0
                Py_ssize_t curstart = tok->start == NULL ? -1 :
961
0
                          tok->start - tok->buf;
962
0
                Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
963
0
                Py_ssize_t curvalid = tok->inp - tok->buf;
964
0
                Py_ssize_t newsize = curvalid + BUFSIZ;
965
0
                char *newbuf = tok->buf;
966
0
                newbuf = (char *)PyMem_REALLOC(newbuf,
967
0
                                               newsize);
968
0
                if (newbuf == NULL) {
969
0
                    tok->done = E_NOMEM;
970
0
                    tok->cur = tok->inp;
971
0
                    return EOF;
972
0
                }
973
0
                tok->buf = newbuf;
974
0
                tok->cur = tok->buf + cur;
975
0
                tok->multi_line_start = tok->buf + cur_multi_line_start;
976
0
                tok->line_start = tok->cur;
977
0
                tok->inp = tok->buf + curvalid;
978
0
                tok->end = tok->buf + newsize;
979
0
                tok->start = curstart < 0 ? NULL :
980
0
                         tok->buf + curstart;
981
0
                if (decoding_fgets(tok->inp,
982
0
                               (int)(tok->end - tok->inp),
983
0
                               tok) == NULL) {
984
                    /* Break out early on decoding
985
                       errors, as tok->buf will be NULL
986
                     */
987
0
                    if (tok->decoding_erred)
988
0
                        return EOF;
989
                    /* Last line does not end in \n,
990
                       fake one */
991
0
                    if (tok->inp[-1] != '\n')
992
0
                        strcpy(tok->inp, "\n");
993
0
                }
994
0
                tok->inp = strchr(tok->inp, '\0');
995
0
                done = tok->inp[-1] == '\n';
996
0
            }
997
0
            if (tok->buf != NULL) {
998
0
                tok->cur = tok->buf + cur;
999
0
                tok->line_start = tok->cur;
1000
                /* replace "\r\n" with "\n" */
1001
                /* For Mac leave the \r, giving a syntax error */
1002
0
                pt = tok->inp - 2;
1003
0
                if (pt >= tok->buf && *pt == '\r') {
1004
0
                    *pt++ = '\n';
1005
0
                    *pt = '\0';
1006
0
                    tok->inp = pt;
1007
0
                }
1008
0
            }
1009
0
        }
1010
0
        if (tok->done != E_OK) {
1011
0
            if (tok->prompt != NULL)
1012
0
                PySys_WriteStderr("\n");
1013
0
            tok->cur = tok->inp;
1014
0
            return EOF;
1015
0
        }
1016
0
    }
1017
    /*NOTREACHED*/
1018
10.3k
}
1019
1020
1021
/* Back-up one character */
1022
1023
static void
1024
tok_backup(struct tok_state *tok, int c)
1025
2.76k
{
1026
2.76k
    if (c != EOF) {
1027
2.71k
        if (--tok->cur < tok->buf)
1028
0
            Py_FatalError("tok_backup: beginning of buffer");
1029
2.71k
        if (*tok->cur != c)
1030
0
            *tok->cur = c;
1031
2.71k
    }
1032
2.76k
}
1033
1034
1035
static int
1036
syntaxerror(struct tok_state *tok, const char *format, ...)
1037
0
{
1038
0
    PyObject *errmsg, *errtext, *args;
1039
0
    va_list vargs;
1040
0
#ifdef HAVE_STDARG_PROTOTYPES
1041
0
    va_start(vargs, format);
1042
#else
1043
    va_start(vargs);
1044
#endif
1045
0
    errmsg = PyUnicode_FromFormatV(format, vargs);
1046
0
    va_end(vargs);
1047
0
    if (!errmsg) {
1048
0
        goto error;
1049
0
    }
1050
1051
0
    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1052
0
                                   "replace");
1053
0
    if (!errtext) {
1054
0
        goto error;
1055
0
    }
1056
0
    int offset = (int)PyUnicode_GET_LENGTH(errtext);
1057
0
    Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1058
0
    if (line_len != tok->cur - tok->line_start) {
1059
0
        Py_DECREF(errtext);
1060
0
        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1061
0
                                       "replace");
1062
0
    }
1063
0
    if (!errtext) {
1064
0
        goto error;
1065
0
    }
1066
1067
0
    args = Py_BuildValue("(O(OiiN))", errmsg,
1068
0
                         tok->filename, tok->lineno, offset, errtext);
1069
0
    if (args) {
1070
0
        PyErr_SetObject(PyExc_SyntaxError, args);
1071
0
        Py_DECREF(args);
1072
0
    }
1073
1074
0
error:
1075
0
    Py_XDECREF(errmsg);
1076
0
    tok->done = E_ERROR;
1077
0
    return ERRORTOKEN;
1078
0
}
1079
1080
static int
1081
indenterror(struct tok_state *tok)
1082
0
{
1083
0
    tok->done = E_TABSPACE;
1084
0
    tok->cur = tok->inp;
1085
0
    return ERRORTOKEN;
1086
0
}
1087
1088
/* Verify that the identifier follows PEP 3131.
1089
   All identifier strings are guaranteed to be "ready" unicode objects.
1090
 */
1091
static int
1092
verify_identifier(struct tok_state *tok)
1093
0
{
1094
0
    PyObject *s;
1095
0
    int result;
1096
0
    if (tok->decoding_erred)
1097
0
        return 0;
1098
0
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1099
0
    if (s == NULL) {
1100
0
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1101
0
            PyErr_Clear();
1102
0
            tok->done = E_IDENTIFIER;
1103
0
        } else {
1104
0
            tok->done = E_ERROR;
1105
0
        }
1106
0
        return 0;
1107
0
    }
1108
0
    result = PyUnicode_IsIdentifier(s);
1109
0
    Py_DECREF(s);
1110
0
    if (result == 0)
1111
0
        tok->done = E_IDENTIFIER;
1112
0
    return result;
1113
0
}
1114
1115
static int
1116
tok_decimal_tail(struct tok_state *tok)
1117
18
{
1118
18
    int c;
1119
1120
18
    while (1) {
1121
30
        do {
1122
30
            c = tok_nextc(tok);
1123
30
        } while (isdigit(c));
1124
18
        if (c != '_') {
1125
18
            break;
1126
18
        }
1127
0
        c = tok_nextc(tok);
1128
0
        if (!isdigit(c)) {
1129
0
            tok_backup(tok, c);
1130
0
            syntaxerror(tok, "invalid decimal literal");
1131
0
            return 0;
1132
0
        }
1133
0
    }
1134
18
    return c;
1135
18
}
1136
1137
/* Get next token, after space stripping etc. */
1138
1139
static int
1140
tok_get(struct tok_state *tok, char **p_start, char **p_end)
1141
1.41k
{
1142
1.41k
    int c;
1143
1.41k
    int blankline, nonascii;
1144
1145
1.41k
    *p_start = *p_end = NULL;
1146
1.50k
  nextline:
1147
1.50k
    tok->start = NULL;
1148
1.50k
    blankline = 0;
1149
1150
    /* Get indentation level */
1151
1.50k
    if (tok->atbol) {
1152
268
        int col = 0;
1153
268
        int altcol = 0;
1154
268
        tok->atbol = 0;
1155
1.43k
        for (;;) {
1156
1.43k
            c = tok_nextc(tok);
1157
1.43k
            if (c == ' ') {
1158
1.16k
                col++, altcol++;
1159
1.16k
            }
1160
268
            else if (c == '\t') {
1161
0
                col = (col / tok->tabsize + 1) * tok->tabsize;
1162
0
                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1163
0
            }
1164
268
            else if (c == '\014')  {/* Control-L (formfeed) */
1165
0
                col = altcol = 0; /* For Emacs users */
1166
0
            }
1167
268
            else {
1168
268
                break;
1169
268
            }
1170
1.43k
        }
1171
268
        tok_backup(tok, c);
1172
268
        if (c == '#' || c == '\n') {
1173
            /* Lines with only whitespace and/or comments
1174
               shouldn't affect the indentation and are
1175
               not passed to the parser as NEWLINE tokens,
1176
               except *totally* empty lines in interactive
1177
               mode, which signal the end of a command group. */
1178
78
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
1179
0
                blankline = 0; /* Let it through */
1180
0
            }
1181
78
            else if (tok->prompt != NULL && tok->lineno == 1) {
1182
                /* In interactive mode, if the first line contains
1183
                   only spaces and/or a comment, let it through. */
1184
0
                blankline = 0;
1185
0
                col = altcol = 0;
1186
0
            }
1187
78
            else {
1188
78
                blankline = 1; /* Ignore completely */
1189
78
            }
1190
            /* We can't jump back right here since we still
1191
               may need to skip to the end of a comment */
1192
78
        }
1193
268
        if (!blankline && tok->level == 0) {
1194
176
            if (col == tok->indstack[tok->indent]) {
1195
                /* No change */
1196
74
                if (altcol != tok->altindstack[tok->indent]) {
1197
0
                    return indenterror(tok);
1198
0
                }
1199
74
            }
1200
102
            else if (col > tok->indstack[tok->indent]) {
1201
                /* Indent -- always one */
1202
58
                if (tok->indent+1 >= MAXINDENT) {
1203
0
                    tok->done = E_TOODEEP;
1204
0
                    tok->cur = tok->inp;
1205
0
                    return ERRORTOKEN;
1206
0
                }
1207
58
                if (altcol <= tok->altindstack[tok->indent]) {
1208
0
                    return indenterror(tok);
1209
0
                }
1210
58
                tok->pendin++;
1211
58
                tok->indstack[++tok->indent] = col;
1212
58
                tok->altindstack[tok->indent] = altcol;
1213
58
            }
1214
44
            else /* col < tok->indstack[tok->indent] */ {
1215
                /* Dedent -- any number, must be consistent */
1216
102
                while (tok->indent > 0 &&
1217
102
                    col < tok->indstack[tok->indent]) {
1218
58
                    tok->pendin--;
1219
58
                    tok->indent--;
1220
58
                }
1221
44
                if (col != tok->indstack[tok->indent]) {
1222
0
                    tok->done = E_DEDENT;
1223
0
                    tok->cur = tok->inp;
1224
0
                    return ERRORTOKEN;
1225
0
                }
1226
44
                if (altcol != tok->altindstack[tok->indent]) {
1227
0
                    return indenterror(tok);
1228
0
                }
1229
44
            }
1230
176
        }
1231
268
    }
1232
1233
1.50k
    tok->start = tok->cur;
1234
1235
    /* Return pending indents/dedents */
1236
1.50k
    if (tok->pendin != 0) {
1237
116
        if (tok->pendin < 0) {
1238
58
            tok->pendin++;
1239
58
            return DEDENT;
1240
58
        }
1241
58
        else {
1242
58
            tok->pendin--;
1243
58
            return INDENT;
1244
58
        }
1245
116
    }
1246
1247
    /* Peek ahead at the next character */
1248
1.38k
    c = tok_nextc(tok);
1249
1.38k
    tok_backup(tok, c);
1250
    /* Check if we are closing an async function */
1251
1.38k
    if (tok->async_def
1252
1.38k
        && !blankline
1253
        /* Due to some implementation artifacts of type comments,
1254
         * a TYPE_COMMENT at the start of a function won't set an
1255
         * indentation level and it will produce a NEWLINE after it.
1256
         * To avoid spuriously ending an async function due to this,
1257
         * wait until we have some non-newline char in front of us. */
1258
1.38k
        && c != '\n'
1259
1.38k
        && tok->level == 0
1260
        /* There was a NEWLINE after ASYNC DEF,
1261
           so we're past the signature. */
1262
1.38k
        && tok->async_def_nl
1263
        /* Current indentation level is less than where
1264
           the async function was defined */
1265
1.38k
        && tok->async_def_indent >= tok->indent)
1266
0
    {
1267
0
        tok->async_def = 0;
1268
0
        tok->async_def_indent = 0;
1269
0
        tok->async_def_nl = 0;
1270
0
    }
1271
1272
1.38k
 again:
1273
1.38k
    tok->start = NULL;
1274
    /* Skip spaces */
1275
1.72k
    do {
1276
1.72k
        c = tok_nextc(tok);
1277
1.72k
    } while (c == ' ' || c == '\t' || c == '\014');
1278
1279
    /* Set start of current token */
1280
1.38k
    tok->start = tok->cur - 1;
1281
1282
    /* Skip comment, unless it's a type comment */
1283
1.38k
    if (c == '#') {
1284
66
        const char *prefix, *p, *type_start;
1285
1286
2.32k
        while (c != EOF && c != '\n') {
1287
2.25k
            c = tok_nextc(tok);
1288
2.25k
        }
1289
1290
66
        if (tok->type_comments) {
1291
0
            p = tok->start;
1292
0
            prefix = type_comment_prefix;
1293
0
            while (*prefix && p < tok->cur) {
1294
0
                if (*prefix == ' ') {
1295
0
                    while (*p == ' ' || *p == '\t') {
1296
0
                        p++;
1297
0
                    }
1298
0
                } else if (*prefix == *p) {
1299
0
                    p++;
1300
0
                } else {
1301
0
                    break;
1302
0
                }
1303
1304
0
                prefix++;
1305
0
            }
1306
1307
            /* This is a type comment if we matched all of type_comment_prefix. */
1308
0
            if (!*prefix) {
1309
0
                int is_type_ignore = 1;
1310
0
                const char *ignore_end = p + 6;
1311
0
                tok_backup(tok, c);  /* don't eat the newline or EOF */
1312
1313
0
                type_start = p;
1314
1315
                /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
1316
                 * or anything ASCII and non-alphanumeric. */
1317
0
                is_type_ignore = (
1318
0
                    tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
1319
0
                    && !(tok->cur > ignore_end
1320
0
                         && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
1321
1322
0
                if (is_type_ignore) {
1323
0
                    *p_start = (char *) ignore_end;
1324
0
                    *p_end = tok->cur;
1325
1326
                    /* If this type ignore is the only thing on the line, consume the newline also. */
1327
0
                    if (blankline) {
1328
0
                        tok_nextc(tok);
1329
0
                        tok->atbol = 1;
1330
0
                    }
1331
0
                    return TYPE_IGNORE;
1332
0
                } else {
1333
0
                    *p_start = (char *) type_start;  /* after type_comment_prefix */
1334
0
                    *p_end = tok->cur;
1335
0
                    return TYPE_COMMENT;
1336
0
                }
1337
0
            }
1338
0
        }
1339
66
    }
1340
1341
    /* Check for EOF and errors now */
1342
1.38k
    if (c == EOF) {
1343
32
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1344
32
    }
1345
1346
    /* Identifier (most frequent token!) */
1347
1.35k
    nonascii = 0;
1348
1.35k
    if (is_potential_identifier_start(c)) {
1349
        /* Process the various legal combinations of b"", r"", u"", and f"". */
1350
530
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1351
584
        while (1) {
1352
584
            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1353
6
                saw_b = 1;
1354
            /* Since this is a backwards compatibility support literal we don't
1355
               want to support it in arbitrary order like byte literals. */
1356
578
            else if (!(saw_b || saw_u || saw_r || saw_f)
1357
578
                     && (c == 'u'|| c == 'U')) {
1358
2
                saw_u = 1;
1359
2
            }
1360
            /* ur"" and ru"" are not supported */
1361
576
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1362
22
                saw_r = 1;
1363
22
            }
1364
554
            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1365
30
                saw_f = 1;
1366
30
            }
1367
524
            else {
1368
524
                break;
1369
524
            }
1370
60
            c = tok_nextc(tok);
1371
60
            if (c == '"' || c == '\'') {
1372
6
                goto letter_quote;
1373
6
            }
1374
60
        }
1375
3.04k
        while (is_potential_identifier_char(c)) {
1376
2.52k
            if (c >= 128) {
1377
0
                nonascii = 1;
1378
0
            }
1379
2.52k
            c = tok_nextc(tok);
1380
2.52k
        }
1381
524
        tok_backup(tok, c);
1382
524
        if (nonascii && !verify_identifier(tok)) {
1383
0
            return ERRORTOKEN;
1384
0
        }
1385
524
        *p_start = tok->start;
1386
524
        *p_end = tok->cur;
1387
1388
        /* async/await parsing block. */
1389
524
        if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1390
            /* May be an 'async' or 'await' token.  For Python 3.7 or
1391
               later we recognize them unconditionally.  For Python
1392
               3.5 or 3.6 we recognize 'async' in front of 'def', and
1393
               either one inside of 'async def'.  (Technically we
1394
               shouldn't recognize these at all for 3.4 or earlier,
1395
               but there's no *valid* Python 3.4 code that would be
1396
               rejected, and async functions will be rejected in a
1397
               later phase.) */
1398
0
            if (!tok->async_hacks || tok->async_def) {
1399
                /* Always recognize the keywords. */
1400
0
                if (memcmp(tok->start, "async", 5) == 0) {
1401
0
                    return ASYNC;
1402
0
                }
1403
0
                if (memcmp(tok->start, "await", 5) == 0) {
1404
0
                    return AWAIT;
1405
0
                }
1406
0
            }
1407
0
            else if (memcmp(tok->start, "async", 5) == 0) {
1408
                /* The current token is 'async'.
1409
                   Look ahead one token to see if that is 'def'. */
1410
1411
0
                struct tok_state ahead_tok;
1412
0
                char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
1413
0
                int ahead_tok_kind;
1414
1415
0
                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1416
0
                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1417
0
                                         &ahead_tok_end);
1418
1419
0
                if (ahead_tok_kind == NAME
1420
0
                    && ahead_tok.cur - ahead_tok.start == 3
1421
0
                    && memcmp(ahead_tok.start, "def", 3) == 0)
1422
0
                {
1423
                    /* The next token is going to be 'def', so instead of
1424
                       returning a plain NAME token, return ASYNC. */
1425
0
                    tok->async_def_indent = tok->indent;
1426
0
                    tok->async_def = 1;
1427
0
                    return ASYNC;
1428
0
                }
1429
0
            }
1430
0
        }
1431
1432
524
        return NAME;
1433
524
    }
1434
1435
    /* Newline */
1436
826
    if (c == '\n') {
1437
252
        tok->atbol = 1;
1438
252
        if (blankline || tok->level > 0) {
1439
92
            goto nextline;
1440
92
        }
1441
160
        *p_start = tok->start;
1442
160
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1443
160
        tok->cont_line = 0;
1444
160
        if (tok->async_def) {
1445
            /* We're somewhere inside an 'async def' function, and
1446
               we've encountered a NEWLINE after its signature. */
1447
0
            tok->async_def_nl = 1;
1448
0
        }
1449
160
        return NEWLINE;
1450
252
    }
1451
1452
    /* Period or number starting with period? */
1453
574
    if (c == '.') {
1454
60
        c = tok_nextc(tok);
1455
60
        if (isdigit(c)) {
1456
0
            goto fraction;
1457
60
        } else if (c == '.') {
1458
0
            c = tok_nextc(tok);
1459
0
            if (c == '.') {
1460
0
                *p_start = tok->start;
1461
0
                *p_end = tok->cur;
1462
0
                return ELLIPSIS;
1463
0
            }
1464
0
            else {
1465
0
                tok_backup(tok, c);
1466
0
            }
1467
0
            tok_backup(tok, '.');
1468
0
        }
1469
60
        else {
1470
60
            tok_backup(tok, c);
1471
60
        }
1472
60
        *p_start = tok->start;
1473
60
        *p_end = tok->cur;
1474
60
        return DOT;
1475
60
    }
1476
1477
    /* Number */
1478
514
    if (isdigit(c)) {
1479
30
        if (c == '0') {
1480
            /* Hex, octal or binary -- maybe. */
1481
14
            c = tok_nextc(tok);
1482
14
            if (c == 'x' || c == 'X') {
1483
                /* Hex */
1484
0
                c = tok_nextc(tok);
1485
0
                do {
1486
0
                    if (c == '_') {
1487
0
                        c = tok_nextc(tok);
1488
0
                    }
1489
0
                    if (!isxdigit(c)) {
1490
0
                        tok_backup(tok, c);
1491
0
                        return syntaxerror(tok, "invalid hexadecimal literal");
1492
0
                    }
1493
0
                    do {
1494
0
                        c = tok_nextc(tok);
1495
0
                    } while (isxdigit(c));
1496
0
                } while (c == '_');
1497
0
            }
1498
14
            else if (c == 'o' || c == 'O') {
1499
                /* Octal */
1500
0
                c = tok_nextc(tok);
1501
0
                do {
1502
0
                    if (c == '_') {
1503
0
                        c = tok_nextc(tok);
1504
0
                    }
1505
0
                    if (c < '0' || c >= '8') {
1506
0
                        tok_backup(tok, c);
1507
0
                        if (isdigit(c)) {
1508
0
                            return syntaxerror(tok,
1509
0
                                    "invalid digit '%c' in octal literal", c);
1510
0
                        }
1511
0
                        else {
1512
0
                            return syntaxerror(tok, "invalid octal literal");
1513
0
                        }
1514
0
                    }
1515
0
                    do {
1516
0
                        c = tok_nextc(tok);
1517
0
                    } while ('0' <= c && c < '8');
1518
0
                } while (c == '_');
1519
0
                if (isdigit(c)) {
1520
0
                    return syntaxerror(tok,
1521
0
                            "invalid digit '%c' in octal literal", c);
1522
0
                }
1523
0
            }
1524
14
            else if (c == 'b' || c == 'B') {
1525
                /* Binary */
1526
0
                c = tok_nextc(tok);
1527
0
                do {
1528
0
                    if (c == '_') {
1529
0
                        c = tok_nextc(tok);
1530
0
                    }
1531
0
                    if (c != '0' && c != '1') {
1532
0
                        tok_backup(tok, c);
1533
0
                        if (isdigit(c)) {
1534
0
                            return syntaxerror(tok,
1535
0
                                    "invalid digit '%c' in binary literal", c);
1536
0
                        }
1537
0
                        else {
1538
0
                            return syntaxerror(tok, "invalid binary literal");
1539
0
                        }
1540
0
                    }
1541
0
                    do {
1542
0
                        c = tok_nextc(tok);
1543
0
                    } while (c == '0' || c == '1');
1544
0
                } while (c == '_');
1545
0
                if (isdigit(c)) {
1546
0
                    return syntaxerror(tok,
1547
0
                            "invalid digit '%c' in binary literal", c);
1548
0
                }
1549
0
            }
1550
14
            else {
1551
14
                int nonzero = 0;
1552
                /* maybe old-style octal; c is first char of it */
1553
                /* in any case, allow '0' as a literal */
1554
14
                while (1) {
1555
14
                    if (c == '_') {
1556
0
                        c = tok_nextc(tok);
1557
0
                        if (!isdigit(c)) {
1558
0
                            tok_backup(tok, c);
1559
0
                            return syntaxerror(tok, "invalid decimal literal");
1560
0
                        }
1561
0
                    }
1562
14
                    if (c != '0') {
1563
14
                        break;
1564
14
                    }
1565
0
                    c = tok_nextc(tok);
1566
0
                }
1567
14
                if (isdigit(c)) {
1568
0
                    nonzero = 1;
1569
0
                    c = tok_decimal_tail(tok);
1570
0
                    if (c == 0) {
1571
0
                        return ERRORTOKEN;
1572
0
                    }
1573
0
                }
1574
14
                if (c == '.') {
1575
2
                    c = tok_nextc(tok);
1576
2
                    goto fraction;
1577
2
                }
1578
12
                else if (c == 'e' || c == 'E') {
1579
0
                    goto exponent;
1580
0
                }
1581
12
                else if (c == 'j' || c == 'J') {
1582
0
                    goto imaginary;
1583
0
                }
1584
12
                else if (nonzero) {
1585
                    /* Old-style octal: now disallowed. */
1586
0
                    tok_backup(tok, c);
1587
0
                    return syntaxerror(tok,
1588
0
                                       "leading zeros in decimal integer "
1589
0
                                       "literals are not permitted; "
1590
0
                                       "use an 0o prefix for octal integers");
1591
0
                }
1592
14
            }
1593
14
        }
1594
16
        else {
1595
            /* Decimal */
1596
16
            c = tok_decimal_tail(tok);
1597
16
            if (c == 0) {
1598
0
                return ERRORTOKEN;
1599
0
            }
1600
16
            {
1601
                /* Accept floating point numbers. */
1602
16
                if (c == '.') {
1603
0
                    c = tok_nextc(tok);
1604
2
        fraction:
1605
                    /* Fraction */
1606
2
                    if (isdigit(c)) {
1607
2
                        c = tok_decimal_tail(tok);
1608
2
                        if (c == 0) {
1609
0
                            return ERRORTOKEN;
1610
0
                        }
1611
2
                    }
1612
2
                }
1613
18
                if (c == 'e' || c == 'E') {
1614
0
                    int e;
1615
0
                  exponent:
1616
0
                    e = c;
1617
                    /* Exponent part */
1618
0
                    c = tok_nextc(tok);
1619
0
                    if (c == '+' || c == '-') {
1620
0
                        c = tok_nextc(tok);
1621
0
                        if (!isdigit(c)) {
1622
0
                            tok_backup(tok, c);
1623
0
                            return syntaxerror(tok, "invalid decimal literal");
1624
0
                        }
1625
0
                    } else if (!isdigit(c)) {
1626
0
                        tok_backup(tok, c);
1627
0
                        tok_backup(tok, e);
1628
0
                        *p_start = tok->start;
1629
0
                        *p_end = tok->cur;
1630
0
                        return NUMBER;
1631
0
                    }
1632
0
                    c = tok_decimal_tail(tok);
1633
0
                    if (c == 0) {
1634
0
                        return ERRORTOKEN;
1635
0
                    }
1636
0
                }
1637
18
                if (c == 'j' || c == 'J') {
1638
                    /* Imaginary part */
1639
0
        imaginary:
1640
0
                    c = tok_nextc(tok);
1641
0
                }
1642
18
            }
1643
18
        }
1644
30
        tok_backup(tok, c);
1645
30
        *p_start = tok->start;
1646
30
        *p_end = tok->cur;
1647
30
        return NUMBER;
1648
30
    }
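The hex, octal and binary branches above all share one shape: an outer do/while that permits a single underscore between digit groups, an inner do/while that consumes one run of digits, and a tok_backup() plus syntaxerror() when a group turns out to be empty. A standalone sketch of that pattern for the binary case follows; the helper name and the driver are hypothetical and not part of tokenizer.c.

    #include <stdio.h>

    /* Hypothetical re-statement of the binary branch above: after the "0b"/"0B"
       prefix, digits come in runs of 0/1 separated by single underscores, and an
       underscore must be followed by at least one more digit. */
    static int
    scan_binary_tail(const char *s, const char **end)
    {
        const char *p = s;
        do {
            if (*p == '_') {
                p++;                             /* one separating underscore */
            }
            if (*p != '0' && *p != '1') {
                *end = p;
                return 0;                        /* empty digit group: invalid */
            }
            do {
                p++;                             /* consume the digit run */
            } while (*p == '0' || *p == '1');
        } while (*p == '_');
        *end = p;
        return 1;
    }

    int
    main(void)
    {
        const char *end;
        printf("%d\n", scan_binary_tail("1010_0110", &end));   /* 1: valid        */
        printf("%d\n", scan_binary_tail("1010__01", &end));    /* 0: empty group  */
        return 0;
    }

The real code performs one extra check after the loop, rejecting a trailing decimal digit (e.g. 0b102) with the "invalid digit ... in binary literal" error.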
1649
1650
490
  letter_quote:
1651
    /* String */
1652
490
    if (c == '\'' || c == '"') {
1653
24
        int quote = c;
1654
24
        int quote_size = 1;             /* 1 or 3 */
1655
24
        int end_quote_size = 0;
1656
1657
        /* Nodes of type STRING, especially multi line strings
1658
           must be handled differently in order to get both
1659
           the starting line number and the column offset right.
1660
           (cf. issue 16806) */
1661
24
        tok->first_lineno = tok->lineno;
1662
24
        tok->multi_line_start = tok->line_start;
1663
1664
        /* Find the quote size and start of string */
1665
24
        c = tok_nextc(tok);
1666
24
        if (c == quote) {
1667
0
            c = tok_nextc(tok);
1668
0
            if (c == quote) {
1669
0
                quote_size = 3;
1670
0
            }
1671
0
            else {
1672
0
                end_quote_size = 1;     /* empty string found */
1673
0
            }
1674
0
        }
1675
24
        if (c != quote) {
1676
24
            tok_backup(tok, c);
1677
24
        }
1678
1679
        /* Get rest of string */
1680
342
        while (end_quote_size != quote_size) {
1681
318
            c = tok_nextc(tok);
1682
318
            if (c == EOF) {
1683
0
                if (quote_size == 3) {
1684
0
                    tok->done = E_EOFS;
1685
0
                }
1686
0
                else {
1687
0
                    tok->done = E_EOLS;
1688
0
                }
1689
0
                tok->cur = tok->inp;
1690
0
                return ERRORTOKEN;
1691
0
            }
1692
318
            if (quote_size == 1 && c == '\n') {
1693
0
                tok->done = E_EOLS;
1694
0
                tok->cur = tok->inp;
1695
0
                return ERRORTOKEN;
1696
0
            }
1697
318
            if (c == quote) {
1698
24
                end_quote_size += 1;
1699
24
            }
1700
294
            else {
1701
294
                end_quote_size = 0;
1702
294
                if (c == '\\') {
1703
6
                    tok_nextc(tok);  /* skip escaped char */
1704
6
                }
1705
294
            }
1706
318
        }
1707
1708
24
        *p_start = tok->start;
1709
24
        *p_end = tok->cur;
1710
24
        return STRING;
1711
24
    }
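The string branch first decides whether the literal uses one or three quote characters, then reads until end_quote_size reaches quote_size, resetting the counter on every non-quote character and skipping one character after a backslash. A self-contained sketch of that loop over a NUL-terminated buffer (a hypothetical helper standing in for the tok_nextc()-driven version above):

    #include <stddef.h>
    #include <stdio.h>

    /* Given a pointer at the opening quote, return the length of the literal
       including its quotes, or -1 if it is unterminated.  As in tok_get(),
       a single-quoted string may not contain a raw newline. */
    static ptrdiff_t
    string_literal_length(const char *s)
    {
        char quote = s[0];
        int quote_size = 1, end_quote_size = 0;
        const char *p = s + 1;

        if (p[0] == quote && p[1] == quote) {    /* triple-quoted */
            quote_size = 3;
            p += 2;
        }
        else if (p[0] == quote) {                /* empty '' or "" */
            return 2;
        }
        while (end_quote_size != quote_size) {
            char c = *p++;
            if (c == '\0' || (quote_size == 1 && c == '\n')) {
                return -1;                       /* E_EOFS / E_EOLS in tokenizer.c */
            }
            if (c == quote) {
                end_quote_size += 1;
            }
            else {
                end_quote_size = 0;
                if (c == '\\' && *p != '\0') {
                    p++;                         /* skip the escaped character */
                }
            }
        }
        return p - s;
    }

    int
    main(void)
    {
        printf("%td\n", string_literal_length("'abc'"));        /* 5              */
        printf("%td\n", string_literal_length("'''a\nb'''"));   /* 9 (multi-line) */
        printf("%td\n", string_literal_length("'oops"));        /* -1             */
        return 0;
    }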
1712
1713
    /* Line continuation */
1714
466
    if (c == '\\') {
1715
0
        c = tok_nextc(tok);
1716
0
        if (c != '\n') {
1717
0
            tok->done = E_LINECONT;
1718
0
            tok->cur = tok->inp;
1719
0
            return ERRORTOKEN;
1720
0
        }
1721
0
        c = tok_nextc(tok);
1722
0
        if (c == EOF) {
1723
0
            tok->done = E_EOF;
1724
0
            tok->cur = tok->inp;
1725
0
            return ERRORTOKEN;
1726
0
        } else {
1727
0
            tok_backup(tok, c);
1728
0
        }
1729
0
        tok->cont_line = 1;
1730
0
        goto again; /* Read next line */
1731
0
    }
1732
1733
    /* Check for two-character token */
1734
466
    {
1735
466
        int c2 = tok_nextc(tok);
1736
466
        int token = PyToken_TwoChars(c, c2);
1737
466
        if (token != OP) {
1738
14
            int c3 = tok_nextc(tok);
1739
14
            int token3 = PyToken_ThreeChars(c, c2, c3);
1740
14
            if (token3 != OP) {
1741
0
                token = token3;
1742
0
            }
1743
14
            else {
1744
14
                tok_backup(tok, c3);
1745
14
            }
1746
14
            *p_start = tok->start;
1747
14
            *p_end = tok->cur;
1748
14
            return token;
1749
14
        }
1750
452
        tok_backup(tok, c2);
1751
452
    }
1752
1753
    /* Keep track of parentheses nesting level */
1754
0
    switch (c) {
1755
98
    case '(':
1756
116
    case '[':
1757
122
    case '{':
1758
122
        if (tok->level >= MAXLEVEL) {
1759
0
            return syntaxerror(tok, "too many nested parentheses");
1760
0
        }
1761
122
        tok->parenstack[tok->level] = c;
1762
122
        tok->parenlinenostack[tok->level] = tok->lineno;
1763
122
        tok->level++;
1764
122
        break;
1765
98
    case ')':
1766
116
    case ']':
1767
122
    case '}':
1768
122
        if (!tok->level) {
1769
0
            return syntaxerror(tok, "unmatched '%c'", c);
1770
0
        }
1771
122
        tok->level--;
1772
122
        int opening = tok->parenstack[tok->level];
1773
122
        if (!((opening == '(' && c == ')') ||
1774
122
              (opening == '[' && c == ']') ||
1775
122
              (opening == '{' && c == '}')))
1776
0
        {
1777
0
            if (tok->parenlinenostack[tok->level] != tok->lineno) {
1778
0
                return syntaxerror(tok,
1779
0
                        "closing parenthesis '%c' does not match "
1780
0
                        "opening parenthesis '%c' on line %d",
1781
0
                        c, opening, tok->parenlinenostack[tok->level]);
1782
0
            }
1783
0
            else {
1784
0
                return syntaxerror(tok,
1785
0
                        "closing parenthesis '%c' does not match "
1786
0
                        "opening parenthesis '%c'",
1787
0
                        c, opening);
1788
0
            }
1789
0
        }
1790
122
        break;
1791
452
    }
1792
1793
    /* Punctuation character */
1794
452
    *p_start = tok->start;
1795
452
    *p_end = tok->cur;
1796
452
    return PyToken_OneChar(c);
1797
452
}
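The final switch in tok_get() tracks bracket nesting: openers push the character and the current line onto parenstack/parenlinenostack (bounded by MAXLEVEL), and closers pop and verify the pair, producing the two "does not match" messages quoted above. A stripped-down sketch of the same bookkeeping over a plain C string; the names, the MAXSTACK bound and the return convention are illustrative, not the tokenizer's.

    #include <stdio.h>

    #define MAXSTACK 200   /* stands in for MAXLEVEL in tokenizer.c */

    /* Push each opener with its line number; on a closer, pop and compare. */
    static int
    check_brackets(const char *src)
    {
        char stack[MAXSTACK];
        int linestack[MAXSTACK];
        int level = 0, lineno = 1;

        for (const char *p = src; *p; p++) {
            char c = *p;
            if (c == '\n') {
                lineno++;
            }
            else if (c == '(' || c == '[' || c == '{') {
                if (level >= MAXSTACK) {
                    printf("too many nested parentheses\n");
                    return 0;
                }
                stack[level] = c;
                linestack[level] = lineno;
                level++;
            }
            else if (c == ')' || c == ']' || c == '}') {
                if (!level) {
                    printf("unmatched '%c'\n", c);
                    return 0;
                }
                level--;
                char opening = stack[level];
                if (!((opening == '(' && c == ')') ||
                      (opening == '[' && c == ']') ||
                      (opening == '{' && c == '}'))) {
                    printf("closing '%c' does not match '%c' on line %d\n",
                           c, opening, linestack[level]);
                    return 0;
                }
            }
        }
        return level == 0;
    }

    int
    main(void)
    {
        printf("%d\n", check_brackets("f(x[0], {1: 2})"));   /* 1 */
        printf("%d\n", check_brackets("(a\n + b]"));          /* 0, mismatch on line 1 */
        return 0;
    }

Unlike this sketch, the tokenizer performs the check one token at a time and reports the mismatch through syntaxerror(), mentioning the opener's line only when it differs from the current one.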
1798
1799
int
1800
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1801
1.41k
{
1802
1.41k
    int result = tok_get(tok, p_start, p_end);
1803
1.41k
    if (tok->decoding_erred) {
1804
0
        result = ERRORTOKEN;
1805
0
        tok->done = E_DECODE;
1806
0
    }
1807
1.41k
    return result;
1808
1.41k
}
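PyTokenizer_Get() is the loop-level entry point: it forwards to tok_get() and collapses any pending decoding error into ERRORTOKEN with tok->done set to E_DECODE. A minimal sketch of driving it, assuming the code is compiled inside the CPython 3.8 source tree (Parser/tokenizer.h is not a public header) and that the interpreter has already been initialized:

    #include "Python.h"
    #include "tokenizer.h"      /* Parser/tokenizer.h: struct tok_state, PyTokenizer_* */
    #include "token.h"          /* ENDMARKER, ERRORTOKEN, _PyParser_TokenNames */

    /* Hypothetical helper: print every token of a source string. */
    static void
    dump_tokens(const char *source)
    {
        struct tok_state *tok = PyTokenizer_FromString(source, 1 /* exec_input */);
        if (tok == NULL) {
            return;
        }
        for (;;) {
            char *start = NULL, *end = NULL;
            int type = PyTokenizer_Get(tok, &start, &end);
            if (type == ERRORTOKEN || type == ENDMARKER) {
                break;
            }
            if (start != NULL && end != NULL) {
                printf("%-12s %.*s\n", _PyParser_TokenNames[type],
                       (int)(end - start), start);
            }
        }
        PyTokenizer_Free(tok);
    }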
1809
1810
/* Get the encoding of a Python file. Check for the coding cookie and check if
1811
   the file starts with a BOM.
1812
1813
   PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1814
   encoding in the first or second line of the file (in which case the encoding
1815
   should be assumed to be UTF-8).
1816
1817
   The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1818
   by the caller. */
1819
1820
char *
1821
PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
1822
0
{
1823
0
    struct tok_state *tok;
1824
0
    FILE *fp;
1825
0
    char *p_start = NULL, *p_end = NULL, *encoding = NULL;
1826
1827
0
    fd = _Py_dup(fd);
1828
0
    if (fd < 0) {
1829
0
        return NULL;
1830
0
    }
1831
1832
0
    fp = fdopen(fd, "r");
1833
0
    if (fp == NULL) {
1834
0
        return NULL;
1835
0
    }
1836
0
    tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1837
0
    if (tok == NULL) {
1838
0
        fclose(fp);
1839
0
        return NULL;
1840
0
    }
1841
0
    if (filename != NULL) {
1842
0
        Py_INCREF(filename);
1843
0
        tok->filename = filename;
1844
0
    }
1845
0
    else {
1846
0
        tok->filename = PyUnicode_FromString("<string>");
1847
0
        if (tok->filename == NULL) {
1848
0
            fclose(fp);
1849
0
            PyTokenizer_Free(tok);
1850
0
            return encoding;
1851
0
        }
1852
0
    }
1853
0
    while (tok->lineno < 2 && tok->done == E_OK) {
1854
0
        PyTokenizer_Get(tok, &p_start, &p_end);
1855
0
    }
1856
0
    fclose(fp);
1857
0
    if (tok->encoding) {
1858
0
        encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1859
0
        if (encoding)
1860
0
            strcpy(encoding, tok->encoding);
1861
0
    }
1862
0
    PyTokenizer_Free(tok);
1863
0
    return encoding;
1864
0
}
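A short usage sketch for PyTokenizer_FindEncodingFilename(): the caller keeps ownership of the descriptor (the function works on a dup of it) and must release the result through the PyMem allocator, as the comment above states. The extern declaration mirrors how CPython's own traceback.c pulls the function in; the helper itself is hypothetical and assumes an initialized interpreter.

    #include <fcntl.h>
    #include <unistd.h>
    #include "Python.h"

    /* Not exposed in a public header; declared locally, as Python/traceback.c does. */
    extern char *PyTokenizer_FindEncodingFilename(int fd, PyObject *filename);

    static void
    print_source_encoding(const char *path)
    {
        int fd = open(path, O_RDONLY);
        if (fd < 0) {
            return;
        }
        char *enc = PyTokenizer_FindEncodingFilename(fd, NULL);
        printf("%s: %s\n", path, enc ? enc : "utf-8 (no BOM or coding cookie)");
        if (enc != NULL) {
            PyMem_FREE(enc);    /* result is PyMem_MALLOC'ed; caller frees it */
        }
        close(fd);              /* the function read from a dup of fd */
    }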
1865
1866
char *
1867
PyTokenizer_FindEncoding(int fd)
1868
0
{
1869
0
    return PyTokenizer_FindEncodingFilename(fd, NULL);
1870
0
}
1871
1872
#ifdef Py_DEBUG
1873
1874
void
1875
tok_dump(int type, char *start, char *end)
1876
{
1877
    printf("%s", _PyParser_TokenNames[type]);
1878
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
1879
        printf("(%.*s)", (int)(end - start), start);
1880
}
1881
1882
#endif