Coverage Report

Created: 2025-07-18 06:10

/src/cpython3/Parser/string_parser.c
Line
Count
Source (jump to first uncovered line)
1
#include <Python.h>
2
#include "pycore_bytesobject.h"   // _PyBytes_DecodeEscape()
3
#include "pycore_unicodeobject.h" // _PyUnicode_DecodeUnicodeEscapeInternal()
4
5
#include "lexer/state.h"
6
#include "pegen.h"
7
#include "string_parser.h"
8
9
#include <stdbool.h>
10
11
//// STRING HANDLING FUNCTIONS ////
12
13
static int
14
warn_invalid_escape_sequence(Parser *p, const char* buffer, const char *first_invalid_escape, Token *t)
15
14.5k
{
16
14.5k
    if (p->call_invalid_rules) {
17
        // Do not report warnings if we are in the second pass of the parser
18
        // to avoid showing the warning twice.
19
3.55k
        return 0;
20
3.55k
    }
21
10.9k
    unsigned char c = (unsigned char)*first_invalid_escape;
22
10.9k
    if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END || t->type == TSTRING_MIDDLE || t->type == TSTRING_END)
23
10.9k
            && (c == '{' || c == '}')) {
24
        // in this case the tokenizer has already emitted a warning,
25
        // see Parser/tokenizer/helpers.c:warn_invalid_escape_sequence
26
1.77k
        return 0;
27
1.77k
    }
28
29
9.18k
    int octal = ('4' <= c && c <= '7');
30
9.18k
    PyObject *msg =
31
9.18k
        octal
32
9.18k
        ? PyUnicode_FromFormat(
33
1.04k
              "\"\\%.3s\" is an invalid octal escape sequence. "
34
1.04k
              "Such sequences will not work in the future. "
35
1.04k
              "Did you mean \"\\\\%.3s\"? A raw string is also an option.",
36
1.04k
              first_invalid_escape, first_invalid_escape)
37
9.18k
        : PyUnicode_FromFormat(
38
8.13k
              "\"\\%c\" is an invalid escape sequence. "
39
8.13k
              "Such sequences will not work in the future. "
40
8.13k
              "Did you mean \"\\\\%c\"? A raw string is also an option.",
41
8.13k
              c, c);
42
9.18k
    if (msg == NULL) {
43
0
        return -1;
44
0
    }
45
9.18k
    PyObject *category;
46
9.18k
    if (p->feature_version >= 12) {
47
9.18k
        category = PyExc_SyntaxWarning;
48
9.18k
    }
49
0
    else {
50
0
        category = PyExc_DeprecationWarning;
51
0
    }
52
53
    // Calculate the lineno and the col_offset of the invalid escape sequence
54
9.18k
    const char *start = buffer;
55
9.18k
    const char *end = first_invalid_escape;
56
9.18k
    int lineno = t->lineno;
57
9.18k
    int col_offset = t->col_offset;
58
658k
    while (start < end) {
59
649k
        if (*start == '\n') {
60
4.65k
            lineno++;
61
4.65k
            col_offset = 0;
62
4.65k
        }
63
645k
        else {
64
645k
            col_offset++;
65
645k
        }
66
649k
        start++;
67
649k
    }
68
69
    // Count the number of quotes in the token
70
9.18k
    char first_quote = 0;
71
9.18k
    if (lineno == t->lineno) {
72
8.96k
        int quote_count = 0;
73
8.96k
        char* tok = PyBytes_AsString(t->bytes);
74
16.4k
        for (int i = 0; i < PyBytes_Size(t->bytes); i++) {
75
15.2k
            if (tok[i] == '\'' || tok[i] == '\"') {
76
7.44k
                if (quote_count == 0) {
77
4.98k
                    first_quote = tok[i];
78
4.98k
                }
79
7.44k
                if (tok[i] == first_quote) {
80
7.21k
                    quote_count++;
81
7.21k
                }
82
7.82k
            } else {
83
7.82k
                break;
84
7.82k
            }
85
15.2k
        }
86
87
8.96k
        col_offset += quote_count;
88
8.96k
    }
89
90
9.18k
    if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
91
9.18k
                                 lineno, NULL, NULL) < 0) {
92
0
        if (PyErr_ExceptionMatches(category)) {
93
            /* Replace the Syntax/DeprecationWarning exception with a SyntaxError
94
               to get a more accurate error report */
95
0
            PyErr_Clear();
96
97
            /* This is needed, in order for the SyntaxError to point to the token t,
98
               since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
99
               error location, if p->known_err_token is not set. */
100
0
            p->known_err_token = t;
101
0
            if (octal) {
102
0
                RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1,
103
0
                    "\"\\%.3s\" is an invalid octal escape sequence. "
104
0
                    "Did you mean \"\\\\%.3s\"? A raw string is also an option.",
105
0
                    first_invalid_escape, first_invalid_escape);
106
0
            }
107
0
            else {
108
0
                RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1,
109
0
                    "\"\\%c\" is an invalid escape sequence. "
110
0
                    "Did you mean \"\\\\%c\"? A raw string is also an option.",
111
0
                    c, c);
112
0
            }
113
0
        }
114
0
        Py_DECREF(msg);
115
0
        return -1;
116
0
    }
117
9.18k
    Py_DECREF(msg);
118
9.18k
    return 0;
119
9.18k
}
120
121
/* Decode the run of bytes with the high bit set that begins at *sPtr.
 *
 * The run ends at the first byte whose high bit is clear, or at `end`,
 * whichever comes first.  *sPtr is advanced past the run, and the run is
 * handed to PyUnicode_DecodeUTF8(), whose result is returned as is.
 */
static PyObject *
decode_utf8(const char **sPtr, const char *end)
{
    const char *const run_start = *sPtr;
    const char *cursor = run_start;

    for (; cursor < end && (*cursor & 0x80); cursor++) {
        /* nothing to do: just step over the non-ASCII byte */
    }

    *sPtr = cursor;
    return PyUnicode_DecodeUTF8(run_start, cursor - run_start, NULL);
}
133
134
static PyObject *
135
decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
136
34.7k
{
137
34.7k
    PyObject *v;
138
34.7k
    PyObject *u;
139
34.7k
    char *buf;
140
34.7k
    char *p;
141
34.7k
    const char *end;
142
143
    /* check for integer overflow */
144
34.7k
    if (len > (size_t)PY_SSIZE_T_MAX / 6) {
145
0
        return NULL;
146
0
    }
147
    /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
148
       "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
149
34.7k
    u = PyBytes_FromStringAndSize((char *)NULL, (Py_ssize_t)len * 6);
150
34.7k
    if (u == NULL) {
151
0
        return NULL;
152
0
    }
153
34.7k
    p = buf = PyBytes_AsString(u);
154
34.7k
    if (p == NULL) {
155
0
        return NULL;
156
0
    }
157
34.7k
    end = s + len;
158
6.28M
    while (s < end) {
159
6.25M
        if (*s == '\\') {
160
549k
            *p++ = *s++;
161
549k
            if (s >= end || *s & 0x80) {
162
8.18k
                strcpy(p, "u005c");
163
8.18k
                p += 5;
164
8.18k
                if (s >= end) {
165
1.18k
                    break;
166
1.18k
                }
167
8.18k
            }
168
549k
        }
169
6.25M
        if (*s & 0x80) {
170
85.4k
            PyObject *w;
171
85.4k
            int kind;
172
85.4k
            const void *data;
173
85.4k
            Py_ssize_t w_len;
174
85.4k
            Py_ssize_t i;
175
85.4k
            w = decode_utf8(&s, end);
176
85.4k
            if (w == NULL) {
177
27
                Py_DECREF(u);
178
27
                return NULL;
179
27
            }
180
85.4k
            kind = PyUnicode_KIND(w);
181
85.4k
            data = PyUnicode_DATA(w);
182
85.4k
            w_len = PyUnicode_GET_LENGTH(w);
183
644k
            for (i = 0; i < w_len; i++) {
184
559k
                Py_UCS4 chr = PyUnicode_READ(kind, data, i);
185
559k
                sprintf(p, "\\U%08x", chr);
186
559k
                p += 10;
187
559k
            }
188
            /* Should be impossible to overflow */
189
85.4k
            assert(p - buf <= PyBytes_GET_SIZE(u));
190
85.4k
            Py_DECREF(w);
191
85.4k
        }
192
6.16M
        else {
193
6.16M
            *p++ = *s++;
194
6.16M
        }
195
6.25M
    }
196
34.6k
    len = (size_t)(p - buf);
197
34.6k
    s = buf;
198
199
34.6k
    int first_invalid_escape_char;
200
34.6k
    const char *first_invalid_escape_ptr;
201
34.6k
    v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL,
202
34.6k
                                                &first_invalid_escape_char,
203
34.6k
                                                &first_invalid_escape_ptr);
204
205
    // HACK: later we can simply pass the line no, since we don't preserve the tokens
206
    // when we are decoding the string but we preserve the line numbers.
207
34.6k
    if (v != NULL && first_invalid_escape_ptr != NULL && t != NULL) {
208
11.2k
        if (warn_invalid_escape_sequence(parser, s, first_invalid_escape_ptr, t) < 0) {
209
            /* We have not decref u before because first_invalid_escape_ptr
210
               points inside u. */
211
0
            Py_XDECREF(u);
212
0
            Py_DECREF(v);
213
0
            return NULL;
214
0
        }
215
11.2k
    }
216
34.6k
    Py_XDECREF(u);
217
34.6k
    return v;
218
34.6k
}
219
220
static PyObject *
221
decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
222
4.55k
{
223
4.55k
    int first_invalid_escape_char;
224
4.55k
    const char *first_invalid_escape_ptr;
225
4.55k
    PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL,
226
4.55k
                                              &first_invalid_escape_char,
227
4.55k
                                              &first_invalid_escape_ptr);
228
4.55k
    if (result == NULL) {
229
5
        return NULL;
230
5
    }
231
232
4.54k
    if (first_invalid_escape_ptr != NULL) {
233
3.29k
        if (warn_invalid_escape_sequence(p, s, first_invalid_escape_ptr, t) < 0) {
234
0
            Py_DECREF(result);
235
0
            return NULL;
236
0
        }
237
3.29k
    }
238
4.54k
    return result;
239
4.54k
}
240
241
PyObject *
242
_PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t)
243
120k
{
244
120k
    if (raw) {
245
85.8k
        return PyUnicode_DecodeUTF8Stateful(s, (Py_ssize_t)len, NULL, NULL);
246
85.8k
    }
247
34.7k
    return decode_unicode_with_escapes(p, s, len, t);
248
120k
}
249
250
/* s must include the bracketing quote characters, and r, b &/or f prefixes
251
    (if any), and embedded escape sequences (if any). (f-strings are handled by the parser)
252
   _PyPegen_parse_string parses it, and returns the decoded Python string object. */
253
PyObject *
254
_PyPegen_parse_string(Parser *p, Token *t)
255
87.5k
{
256
87.5k
    const char *s = PyBytes_AsString(t->bytes);
257
87.5k
    if (s == NULL) {
258
0
        return NULL;
259
0
    }
260
261
87.5k
    size_t len;
262
87.5k
    int quote = Py_CHARMASK(*s);
263
87.5k
    int bytesmode = 0;
264
87.5k
    int rawmode = 0;
265
266
87.5k
    if (Py_ISALPHA(quote)) {
267
68.7k
        while (!bytesmode || !rawmode) {
268
60.1k
            if (quote == 'b' || quote == 'B') {
269
26.0k
                quote =(unsigned char)*++s;
270
26.0k
                bytesmode = 1;
271
26.0k
            }
272
34.1k
            else if (quote == 'u' || quote == 'U') {
273
1.82k
                quote = (unsigned char)*++s;
274
1.82k
            }
275
32.3k
            else if (quote == 'r' || quote == 'R') {
276
10.8k
                quote = (unsigned char)*++s;
277
10.8k
                rawmode = 1;
278
10.8k
            }
279
21.4k
            else {
280
21.4k
                break;
281
21.4k
            }
282
60.1k
        }
283
30.0k
    }
284
285
87.5k
    if (quote != '\'' && quote != '\"') {
286
0
        PyErr_BadInternalCall();
287
0
        return NULL;
288
0
    }
289
290
    /* Skip the leading quote char. */
291
87.5k
    s++;
292
87.5k
    len = strlen(s);
293
    // gh-120155: 's' contains at least the trailing quote,
294
    // so the code '--len' below is safe.
295
87.5k
    assert(len >= 1);
296
297
87.5k
    if (len > INT_MAX) {
298
0
        PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
299
0
        return NULL;
300
0
    }
301
87.5k
    if (s[--len] != quote) {
302
        /* Last quote char must match the first. */
303
0
        PyErr_BadInternalCall();
304
0
        return NULL;
305
0
    }
306
87.5k
    if (len >= 4 && s[0] == quote && s[1] == quote) {
307
        /* A triple quoted string. We've already skipped one quote at
308
           the start and one at the end of the string. Now skip the
309
           two at the start. */
310
8.11k
        s += 2;
311
8.11k
        len -= 2;
312
        /* And check that the last two match. */
313
8.11k
        if (s[--len] != quote || s[--len] != quote) {
314
0
            PyErr_BadInternalCall();
315
0
            return NULL;
316
0
        }
317
8.11k
    }
318
319
    /* Avoid invoking escape decoding routines if possible. */
320
87.5k
    rawmode = rawmode || strchr(s, '\\') == NULL;
321
87.5k
    if (bytesmode) {
322
        /* Disallow non-ASCII characters. */
323
26.0k
        const char *ch;
324
4.53M
        for (ch = s; *ch; ch++) {
325
4.50M
            if (Py_CHARMASK(*ch) >= 0x80) {
326
17
                RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
327
17
                                   t,
328
17
                                   "bytes can only contain ASCII "
329
17
                                   "literal characters");
330
17
                return NULL;
331
17
            }
332
4.50M
        }
333
26.0k
        if (rawmode) {
334
21.4k
            return PyBytes_FromStringAndSize(s, (Py_ssize_t)len);
335
21.4k
        }
336
4.55k
        return decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t);
337
26.0k
    }
338
61.5k
    return _PyPegen_decode_string(p, rawmode, s, len, t);
339
87.5k
}