Coverage Report

Created: 2026-03-19 06:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython3/Parser/tokenizer/string_tokenizer.c
Line
Count
Source
1
#include "Python.h"
2
#include "errcode.h"
3
4
#include "helpers.h"
5
#include "../lexer/state.h"
6
7
static int
8
122k
tok_underflow_string(struct tok_state *tok) {
9
122k
    char *end = strchr(tok->inp, '\n');
10
122k
    if (end != NULL) {
11
113k
        end++;
12
113k
    }
13
9.47k
    else {
14
9.47k
        end = strchr(tok->inp, '\0');
15
9.47k
        if (end == tok->inp) {
16
6.32k
            tok->done = E_EOF;
17
6.32k
            return 0;
18
6.32k
        }
19
9.47k
    }
20
116k
    if (tok->start == NULL) {
21
110k
        tok->buf = tok->cur;
22
110k
    }
23
116k
    tok->line_start = tok->cur;
24
116k
    ADVANCE_LINENO();
25
116k
    tok->inp = end;
26
116k
    return 1;
27
122k
}
28
29
/* Fetch a byte from TOK, using the string buffer. */
30
static int
31
8.80k
buf_getc(struct tok_state *tok) {
32
8.80k
    return Py_CHARMASK(*tok->str++);
33
8.80k
}
34
35
/* Unfetch a byte from TOK, using the string buffer. */
36
static void
37
8.80k
buf_ungetc(int c, struct tok_state *tok) {
38
8.80k
    tok->str--;
39
8.80k
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
40
8.80k
}
41
42
/* Set the readline function for TOK to ENC. For the string-based
43
   tokenizer, this means to just record the encoding. */
44
static int
45
2.37k
buf_setreadl(struct tok_state *tok, const char* enc) {
46
2.37k
    tok->enc = enc;
47
2.37k
    return 1;
48
2.37k
}
49
50
/* Decode a byte string STR for use as the buffer of TOK.
51
   Look for encoding declarations inside STR, and record them
52
   inside TOK.  */
53
static char *
54
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
55
8.79k
{
56
8.79k
    PyObject* utf8 = NULL;
57
8.79k
    char *str;
58
8.79k
    const char *s;
59
8.79k
    const char *newl[2] = {NULL, NULL};
60
8.79k
    int lineno = 0;
61
8.79k
    tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
62
8.79k
    if (str == NULL)
63
0
        return NULL;
64
8.79k
    tok->enc = NULL;
65
8.79k
    tok->str = str;
66
8.79k
    if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
67
0
        return _PyTokenizer_error_ret(tok);
68
8.79k
    str = tok->str;             /* string after BOM if any */
69
8.79k
    assert(str);
70
8.79k
    if (tok->enc != NULL) {
71
0
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
72
0
        if (utf8 == NULL)
73
0
            return _PyTokenizer_error_ret(tok);
74
0
        str = PyBytes_AsString(utf8);
75
0
    }
76
2.67M
    for (s = str;; s++) {
77
2.67M
        if (*s == '\0') break;
78
2.66M
        else if (*s == '\n') {
79
8.38k
            assert(lineno < 2);
80
8.38k
            newl[lineno] = s;
81
8.38k
            lineno++;
82
8.38k
            if (lineno == 2) break;
83
8.38k
        }
84
2.67M
    }
85
8.79k
    tok->enc = NULL;
86
    /* need to check line 1 and 2 separately since check_coding_spec
87
       assumes a single line as input */
88
8.79k
    if (newl[0]) {
89
5.67k
        tok->lineno = 1;
90
5.67k
        if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
91
0
            return NULL;
92
0
        }
93
5.67k
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
94
442
            tok->lineno = 2;
95
442
            if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
96
442
                                   tok, buf_setreadl))
97
0
                return NULL;
98
442
        }
99
5.67k
    }
100
8.79k
    tok->lineno = 0;
101
8.79k
    if (tok->enc != NULL) {
102
2.37k
        assert(utf8 == NULL);
103
2.37k
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
104
2.37k
        if (utf8 == NULL)
105
914
            return _PyTokenizer_error_ret(tok);
106
1.46k
        str = PyBytes_AS_STRING(utf8);
107
1.46k
    }
108
6.41k
    else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
109
338
        return _PyTokenizer_error_ret(tok);
110
338
    }
111
7.53k
    if (utf8 != NULL) {
112
1.46k
        char *translated = _PyTokenizer_translate_newlines(
113
1.46k
            str, single, preserve_crlf, tok);
114
1.46k
        if (translated == NULL) {
115
0
            Py_DECREF(utf8);
116
0
            return _PyTokenizer_error_ret(tok);
117
0
        }
118
1.46k
        PyMem_Free(tok->input);
119
1.46k
        tok->input = translated;
120
1.46k
        str = translated;
121
1.46k
        Py_CLEAR(utf8);
122
1.46k
    }
123
7.53k
    tok->str = str;
124
7.53k
    assert(tok->decoding_buffer == NULL);
125
7.53k
    tok->decoding_buffer = utf8; /* CAUTION */
126
7.53k
    return str;
127
7.53k
}
128
129
/* Set up tokenizer for string */
130
struct tok_state *
131
_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
132
8.79k
{
133
8.79k
    struct tok_state *tok = _PyTokenizer_tok_new();
134
8.79k
    char *decoded;
135
136
8.79k
    if (tok == NULL)
137
0
        return NULL;
138
8.79k
    decoded = decode_str(str, exec_input, tok, preserve_crlf);
139
8.79k
    if (decoded == NULL) {
140
1.25k
        _PyTokenizer_Free(tok);
141
1.25k
        return NULL;
142
1.25k
    }
143
144
7.53k
    tok->buf = tok->cur = tok->inp = decoded;
145
7.53k
    tok->end = decoded;
146
7.53k
    tok->underflow = &tok_underflow_string;
147
7.53k
    return tok;
148
8.79k
}