Coverage Report

Created: 2026-04-12 06:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Parser/tokenizer/string_tokenizer.c
Line
Count
Source
1
#include "Python.h"
2
#include "errcode.h"
3
4
#include "helpers.h"
5
#include "../lexer/state.h"
6
7
static int
8
193k
tok_underflow_string(struct tok_state *tok) {
9
193k
    char *end = strchr(tok->inp, '\n');
10
193k
    if (end != NULL) {
11
178k
        end++;
12
178k
    }
13
14.6k
    else {
14
14.6k
        end = strchr(tok->inp, '\0');
15
14.6k
        if (end == tok->inp) {
16
14.6k
            tok->done = E_EOF;
17
14.6k
            return 0;
18
14.6k
        }
19
14.6k
    }
20
178k
    if (tok->start == NULL) {
21
170k
        tok->buf = tok->cur;
22
170k
    }
23
178k
    tok->line_start = tok->cur;
24
178k
    ADVANCE_LINENO();
25
178k
    tok->inp = end;
26
178k
    return 1;
27
193k
}
28
29
/* Fetch a byte from TOK, using the string buffer. */
30
static int
31
19.6k
buf_getc(struct tok_state *tok) {
32
19.6k
    return Py_CHARMASK(*tok->str++);
33
19.6k
}
34
35
/* Unfetch a byte from TOK, using the string buffer. */
36
static void
37
19.6k
buf_ungetc(int c, struct tok_state *tok) {
38
19.6k
    tok->str--;
39
19.6k
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
40
19.6k
}
41
42
/* Set the readline function for TOK to ENC. For the string-based
43
   tokenizer, this means to just record the encoding. */
44
static int
45
3.84k
buf_setreadl(struct tok_state *tok, const char* enc) {
46
3.84k
    tok->enc = enc;
47
3.84k
    return 1;
48
3.84k
}
49
50
/* Decode a byte string STR for use as the buffer of TOK.
51
   Look for encoding declarations inside STR, and record them
52
   inside TOK.  */
53
static char *
54
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
55
19.5k
{
56
19.5k
    PyObject* utf8 = NULL;
57
19.5k
    char *str;
58
19.5k
    const char *s;
59
19.5k
    const char *newl[2] = {NULL, NULL};
60
19.5k
    int lineno = 0;
61
19.5k
    tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
62
19.5k
    if (str == NULL)
63
0
        return NULL;
64
19.5k
    tok->enc = NULL;
65
19.5k
    tok->str = str;
66
19.5k
    if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
67
0
        return _PyTokenizer_error_ret(tok);
68
19.5k
    str = tok->str;             /* string after BOM if any */
69
19.5k
    assert(str);
70
19.5k
    if (tok->enc != NULL) {
71
0
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
72
0
        if (utf8 == NULL)
73
0
            return _PyTokenizer_error_ret(tok);
74
0
        str = PyBytes_AsString(utf8);
75
0
    }
76
2.21M
    for (s = str;; s++) {
77
2.21M
        if (*s == '\0') break;
78
2.20M
        else if (*s == '\n') {
79
24.5k
            assert(lineno < 2);
80
24.5k
            newl[lineno] = s;
81
24.5k
            lineno++;
82
24.5k
            if (lineno == 2) break;
83
24.5k
        }
84
2.21M
    }
85
19.5k
    tok->enc = NULL;
86
    /* need to check line 1 and 2 separately since check_coding_spec
87
       assumes a single line as input */
88
19.5k
    if (newl[0]) {
89
19.5k
        tok->lineno = 1;
90
19.5k
        if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
91
1
            return NULL;
92
1
        }
93
19.5k
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
94
557
            tok->lineno = 2;
95
557
            if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
96
557
                                   tok, buf_setreadl))
97
0
                return NULL;
98
557
        }
99
19.5k
    }
100
19.5k
    tok->lineno = 0;
101
19.5k
    if (tok->enc != NULL) {
102
3.84k
        assert(utf8 == NULL);
103
3.84k
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
104
3.84k
        if (utf8 == NULL)
105
1.83k
            return _PyTokenizer_error_ret(tok);
106
2.00k
        str = PyBytes_AS_STRING(utf8);
107
2.00k
    }
108
15.7k
    else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
109
577
        return _PyTokenizer_error_ret(tok);
110
577
    }
111
17.1k
    if (utf8 != NULL) {
112
2.00k
        char *translated = _PyTokenizer_translate_newlines(
113
2.00k
            str, single, preserve_crlf, tok);
114
2.00k
        if (translated == NULL) {
115
0
            Py_DECREF(utf8);
116
0
            return _PyTokenizer_error_ret(tok);
117
0
        }
118
2.00k
        PyMem_Free(tok->input);
119
2.00k
        tok->input = translated;
120
2.00k
        str = translated;
121
2.00k
        Py_CLEAR(utf8);
122
2.00k
    }
123
17.1k
    tok->str = str;
124
17.1k
    assert(tok->decoding_buffer == NULL);
125
17.1k
    tok->decoding_buffer = utf8; /* CAUTION */
126
17.1k
    return str;
127
17.1k
}
128
129
/* Set up tokenizer for string */
130
struct tok_state *
131
_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
132
19.5k
{
133
19.5k
    struct tok_state *tok = _PyTokenizer_tok_new();
134
19.5k
    char *decoded;
135
136
19.5k
    if (tok == NULL)
137
0
        return NULL;
138
19.5k
    decoded = decode_str(str, exec_input, tok, preserve_crlf);
139
19.5k
    if (decoded == NULL) {
140
2.41k
        _PyTokenizer_Free(tok);
141
2.41k
        return NULL;
142
2.41k
    }
143
144
17.1k
    tok->buf = tok->cur = tok->inp = decoded;
145
17.1k
    tok->end = decoded;
146
17.1k
    tok->underflow = &tok_underflow_string;
147
17.1k
    return tok;
148
19.5k
}