/src/cpython/Parser/string_parser.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include <Python.h> |
2 | | #include "pycore_bytesobject.h" // _PyBytes_DecodeEscape() |
3 | | #include "pycore_unicodeobject.h" // _PyUnicode_DecodeUnicodeEscapeInternal() |
4 | | |
5 | | #include "lexer/state.h" |
6 | | #include "pegen.h" |
7 | | #include "string_parser.h" |
8 | | |
9 | | #include <stdbool.h> |
10 | | |
11 | | //// STRING HANDLING FUNCTIONS //// |
12 | | |
13 | | static int |
14 | | warn_invalid_escape_sequence(Parser *p, const char* buffer, const char *first_invalid_escape, Token *t) |
15 | 5.20k | { |
16 | 5.20k | if (p->call_invalid_rules) { |
17 | | // Do not report warnings if we are in the second pass of the parser |
18 | | // to avoid showing the warning twice. |
19 | 1.68k | return 0; |
20 | 1.68k | } |
21 | 3.52k | unsigned char c = (unsigned char)*first_invalid_escape; |
22 | 3.52k | if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END || t->type == TSTRING_MIDDLE || t->type == TSTRING_END) |
23 | 3.52k | && (c == '{' || c == '}')) { |
24 | | // in this case the tokenizer has already emitted a warning, |
25 | | // see Parser/tokenizer/helpers.c:warn_invalid_escape_sequence |
26 | 512 | return 0; |
27 | 512 | } |
28 | | |
29 | 3.01k | int octal = ('4' <= c && c <= '7'); |
30 | 3.01k | PyObject *msg = |
31 | 3.01k | octal |
32 | 3.01k | ? PyUnicode_FromFormat( |
33 | 740 | "\"\\%.3s\" is an invalid octal escape sequence. " |
34 | 740 | "Such sequences will not work in the future. " |
35 | 740 | "Did you mean \"\\\\%.3s\"? A raw string is also an option.", |
36 | 740 | first_invalid_escape, first_invalid_escape) |
37 | 3.01k | : PyUnicode_FromFormat( |
38 | 2.27k | "\"\\%c\" is an invalid escape sequence. " |
39 | 2.27k | "Such sequences will not work in the future. " |
40 | 2.27k | "Did you mean \"\\\\%c\"? A raw string is also an option.", |
41 | 2.27k | c, c); |
42 | 3.01k | if (msg == NULL) { |
43 | 0 | return -1; |
44 | 0 | } |
45 | 3.01k | PyObject *category; |
46 | 3.01k | if (p->feature_version >= 12) { |
47 | 3.01k | category = PyExc_SyntaxWarning; |
48 | 3.01k | } |
49 | 0 | else { |
50 | 0 | category = PyExc_DeprecationWarning; |
51 | 0 | } |
52 | | |
53 | | // Calculate the lineno and the col_offset of the invalid escape sequence |
54 | 3.01k | const char *start = buffer; |
55 | 3.01k | const char *end = first_invalid_escape; |
56 | 3.01k | int lineno = t->lineno; |
57 | 3.01k | int col_offset = t->col_offset; |
58 | 30.6k | while (start < end) { |
59 | 27.6k | if (*start == '\n') { |
60 | 907 | lineno++; |
61 | 907 | col_offset = 0; |
62 | 907 | } |
63 | 26.7k | else { |
64 | 26.7k | col_offset++; |
65 | 26.7k | } |
66 | 27.6k | start++; |
67 | 27.6k | } |
68 | | |
69 | | // Count the number of quotes in the token |
70 | 3.01k | char first_quote = 0; |
71 | 3.01k | if (lineno == t->lineno) { |
72 | 2.80k | int quote_count = 0; |
73 | 2.80k | char* tok = PyBytes_AsString(t->bytes); |
74 | 5.34k | for (int i = 0; i < PyBytes_Size(t->bytes); i++) { |
75 | 4.15k | if (tok[i] == '\'' || tok[i] == '\"') { |
76 | 2.53k | if (quote_count == 0) { |
77 | 2.17k | first_quote = tok[i]; |
78 | 2.17k | } |
79 | 2.53k | if (tok[i] == first_quote) { |
80 | 2.28k | quote_count++; |
81 | 2.28k | } |
82 | 2.53k | } else { |
83 | 1.61k | break; |
84 | 1.61k | } |
85 | 4.15k | } |
86 | | |
87 | 2.80k | col_offset += quote_count; |
88 | 2.80k | } |
89 | | |
90 | 3.01k | if (PyErr_WarnExplicitObject(category, msg, p->tok->filename, |
91 | 3.01k | lineno, NULL, NULL) < 0) { |
92 | 0 | if (PyErr_ExceptionMatches(category)) { |
93 | | /* Replace the Syntax/DeprecationWarning exception with a SyntaxError |
94 | | to get a more accurate error report */ |
95 | 0 | PyErr_Clear(); |
96 | | |
97 | | /* This is needed, in order for the SyntaxError to point to the token t, |
98 | | since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the |
99 | | error location, if p->known_err_token is not set. */ |
100 | 0 | p->known_err_token = t; |
101 | 0 | if (octal) { |
102 | 0 | RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1, |
103 | 0 | "\"\\%.3s\" is an invalid octal escape sequence. " |
104 | 0 | "Did you mean \"\\\\%.3s\"? A raw string is also an option.", |
105 | 0 | first_invalid_escape, first_invalid_escape); |
106 | 0 | } |
107 | 0 | else { |
108 | 0 | RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1, |
109 | 0 | "\"\\%c\" is an invalid escape sequence. " |
110 | 0 | "Did you mean \"\\\\%c\"? A raw string is also an option.", |
111 | 0 | c, c); |
112 | 0 | } |
113 | 0 | } |
114 | 0 | Py_DECREF(msg); |
115 | 0 | return -1; |
116 | 0 | } |
117 | 3.01k | Py_DECREF(msg); |
118 | 3.01k | return 0; |
119 | 3.01k | } |
120 | | |
121 | | static PyObject * |
122 | | decode_utf8(const char **sPtr, const char *end) |
123 | 7.08k | { |
124 | 7.08k | const char *s; |
125 | 7.08k | const char *t; |
126 | 7.08k | t = s = *sPtr; |
127 | 59.8k | while (s < end && (*s & 0x80)) { |
128 | 52.8k | s++; |
129 | 52.8k | } |
130 | 7.08k | *sPtr = s; |
131 | 7.08k | return PyUnicode_DecodeUTF8(t, s - t, NULL); |
132 | 7.08k | } |
133 | | |
134 | | static PyObject * |
135 | | decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t) |
136 | 32.4k | { |
137 | 32.4k | PyObject *v; |
138 | 32.4k | PyObject *u; |
139 | 32.4k | char *buf; |
140 | 32.4k | char *p; |
141 | 32.4k | const char *end; |
142 | | |
143 | | /* check for integer overflow */ |
144 | 32.4k | if (len > (size_t)PY_SSIZE_T_MAX / 6) { |
145 | 0 | return NULL; |
146 | 0 | } |
147 | | /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5 |
148 | | "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */ |
149 | 32.4k | u = PyBytes_FromStringAndSize((char *)NULL, (Py_ssize_t)len * 6); |
150 | 32.4k | if (u == NULL) { |
151 | 0 | return NULL; |
152 | 0 | } |
153 | 32.4k | p = buf = PyBytes_AsString(u); |
154 | 32.4k | if (p == NULL) { |
155 | 0 | return NULL; |
156 | 0 | } |
157 | 32.4k | end = s + len; |
158 | 276k | while (s < end) { |
159 | 244k | if (*s == '\\') { |
160 | 41.6k | *p++ = *s++; |
161 | 41.6k | if (s >= end || *s & 0x80) { |
162 | 1.47k | strcpy(p, "u005c"); |
163 | 1.47k | p += 5; |
164 | 1.47k | if (s >= end) { |
165 | 305 | break; |
166 | 305 | } |
167 | 1.47k | } |
168 | 41.6k | } |
169 | 244k | if (*s & 0x80) { |
170 | 7.08k | PyObject *w; |
171 | 7.08k | int kind; |
172 | 7.08k | const void *data; |
173 | 7.08k | Py_ssize_t w_len; |
174 | 7.08k | Py_ssize_t i; |
175 | 7.08k | w = decode_utf8(&s, end); |
176 | 7.08k | if (w == NULL) { |
177 | 32 | Py_DECREF(u); |
178 | 32 | return NULL; |
179 | 32 | } |
180 | 7.05k | kind = PyUnicode_KIND(w); |
181 | 7.05k | data = PyUnicode_DATA(w); |
182 | 7.05k | w_len = PyUnicode_GET_LENGTH(w); |
183 | 26.4k | for (i = 0; i < w_len; i++) { |
184 | 19.4k | Py_UCS4 chr = PyUnicode_READ(kind, data, i); |
185 | 19.4k | sprintf(p, "\\U%08x", chr); |
186 | 19.4k | p += 10; |
187 | 19.4k | } |
188 | | /* Should be impossible to overflow */ |
189 | 7.05k | assert(p - buf <= PyBytes_GET_SIZE(u)); |
190 | 7.05k | Py_DECREF(w); |
191 | 7.05k | } |
192 | 237k | else { |
193 | 237k | *p++ = *s++; |
194 | 237k | } |
195 | 244k | } |
196 | 32.3k | len = (size_t)(p - buf); |
197 | 32.3k | s = buf; |
198 | | |
199 | 32.3k | int first_invalid_escape_char; |
200 | 32.3k | const char *first_invalid_escape_ptr; |
201 | 32.3k | v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL, |
202 | 32.3k | &first_invalid_escape_char, |
203 | 32.3k | &first_invalid_escape_ptr); |
204 | | |
205 | | // HACK: later we can simply pass the line no, since we don't preserve the tokens |
206 | | // when we are decoding the string but we preserve the line numbers. |
207 | 32.3k | if (v != NULL && first_invalid_escape_ptr != NULL && t != NULL) { |
208 | 4.52k | if (warn_invalid_escape_sequence(parser, s, first_invalid_escape_ptr, t) < 0) { |
209 | | /* We have not decref u before because first_invalid_escape_ptr |
210 | | points inside u. */ |
211 | 0 | Py_XDECREF(u); |
212 | 0 | Py_DECREF(v); |
213 | 0 | return NULL; |
214 | 0 | } |
215 | 4.52k | } |
216 | 32.3k | Py_XDECREF(u); |
217 | 32.3k | return v; |
218 | 32.3k | } |
219 | | |
220 | | static PyObject * |
221 | | decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t) |
222 | 2.88k | { |
223 | 2.88k | int first_invalid_escape_char; |
224 | 2.88k | const char *first_invalid_escape_ptr; |
225 | 2.88k | PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL, |
226 | 2.88k | &first_invalid_escape_char, |
227 | 2.88k | &first_invalid_escape_ptr); |
228 | 2.88k | if (result == NULL) { |
229 | 5 | return NULL; |
230 | 5 | } |
231 | | |
232 | 2.88k | if (first_invalid_escape_ptr != NULL) { |
233 | 683 | if (warn_invalid_escape_sequence(p, s, first_invalid_escape_ptr, t) < 0) { |
234 | 0 | Py_DECREF(result); |
235 | 0 | return NULL; |
236 | 0 | } |
237 | 683 | } |
238 | 2.88k | return result; |
239 | 2.88k | } |
240 | | |
241 | | PyObject * |
242 | | _PyPegen_decode_string(Parser *p, int raw, const char *s, size_t len, Token *t) |
243 | 93.6k | { |
244 | 93.6k | if (raw) { |
245 | 61.2k | return PyUnicode_DecodeUTF8Stateful(s, (Py_ssize_t)len, NULL, NULL); |
246 | 61.2k | } |
247 | 32.4k | return decode_unicode_with_escapes(p, s, len, t); |
248 | 93.6k | } |
249 | | |
250 | | /* s must include the bracketing quote characters, and r, b &/or f prefixes |
251 | | (if any), and embedded escape sequences (if any). (f-strings are handled by the parser) |
252 | | _PyPegen_parse_string parses it, and returns the decoded Python string object. */ |
253 | | PyObject * |
254 | | _PyPegen_parse_string(Parser *p, Token *t) |
255 | 63.3k | { |
256 | 63.3k | const char *s = PyBytes_AsString(t->bytes); |
257 | 63.3k | if (s == NULL) { |
258 | 0 | return NULL; |
259 | 0 | } |
260 | | |
261 | 63.3k | size_t len; |
262 | 63.3k | int quote = Py_CHARMASK(*s); |
263 | 63.3k | int bytesmode = 0; |
264 | 63.3k | int rawmode = 0; |
265 | | |
266 | 63.3k | if (Py_ISALPHA(quote)) { |
267 | 14.6k | while (!bytesmode || !rawmode) { |
268 | 14.1k | if (quote == 'b' || quote == 'B') { |
269 | 5.78k | quote =(unsigned char)*++s; |
270 | 5.78k | bytesmode = 1; |
271 | 5.78k | } |
272 | 8.41k | else if (quote == 'u' || quote == 'U') { |
273 | 630 | quote = (unsigned char)*++s; |
274 | 630 | } |
275 | 7.78k | else if (quote == 'r' || quote == 'R') { |
276 | 1.09k | quote = (unsigned char)*++s; |
277 | 1.09k | rawmode = 1; |
278 | 1.09k | } |
279 | 6.68k | else { |
280 | 6.68k | break; |
281 | 6.68k | } |
282 | 14.1k | } |
283 | 7.09k | } |
284 | | |
285 | 63.3k | if (quote != '\'' && quote != '\"') { |
286 | 1 | PyErr_BadInternalCall(); |
287 | 1 | return NULL; |
288 | 1 | } |
289 | | |
290 | | /* Skip the leading quote char. */ |
291 | 63.3k | s++; |
292 | 63.3k | len = strlen(s); |
293 | | // gh-120155: 's' contains at least the trailing quote, |
294 | | // so the code '--len' below is safe. |
295 | 63.3k | assert(len >= 1); |
296 | | |
297 | 63.3k | if (len > INT_MAX) { |
298 | 0 | PyErr_SetString(PyExc_OverflowError, "string to parse is too long"); |
299 | 0 | return NULL; |
300 | 0 | } |
301 | 63.3k | if (s[--len] != quote) { |
302 | | /* Last quote char must match the first. */ |
303 | 0 | PyErr_BadInternalCall(); |
304 | 0 | return NULL; |
305 | 0 | } |
306 | 63.3k | if (len >= 4 && s[0] == quote && s[1] == quote) { |
307 | | /* A triple quoted string. We've already skipped one quote at |
308 | | the start and one at the end of the string. Now skip the |
309 | | two at the start. */ |
310 | 2.49k | s += 2; |
311 | 2.49k | len -= 2; |
312 | | /* And check that the last two match. */ |
313 | 2.49k | if (s[--len] != quote || s[--len] != quote) { |
314 | 0 | PyErr_BadInternalCall(); |
315 | 0 | return NULL; |
316 | 0 | } |
317 | 2.49k | } |
318 | | |
319 | | /* Avoid invoking escape decoding routines if possible. */ |
320 | 63.3k | rawmode = rawmode || strchr(s, '\\') == NULL; |
321 | 63.3k | if (bytesmode) { |
322 | | /* Disallow non-ASCII characters. */ |
323 | 5.78k | const char *ch; |
324 | 75.6k | for (ch = s; *ch; ch++) { |
325 | 69.9k | if (Py_CHARMASK(*ch) >= 0x80) { |
326 | 12 | RAISE_SYNTAX_ERROR_KNOWN_LOCATION( |
327 | 12 | t, |
328 | 12 | "bytes can only contain ASCII " |
329 | 12 | "literal characters"); |
330 | 12 | return NULL; |
331 | 12 | } |
332 | 69.9k | } |
333 | 5.77k | if (rawmode) { |
334 | 2.88k | return PyBytes_FromStringAndSize(s, (Py_ssize_t)len); |
335 | 2.88k | } |
336 | 2.88k | return decode_bytes_with_escapes(p, s, (Py_ssize_t)len, t); |
337 | 5.77k | } |
338 | 57.5k | return _PyPegen_decode_string(p, rawmode, s, len, t); |
339 | 63.3k | } |