/src/cpython/Objects/unicode_format.c
Line | Count | Source |
1 | | /* |
2 | | |
3 | | Unicode implementation based on original code by Fredrik Lundh, |
4 | | modified by Marc-Andre Lemburg <mal@lemburg.com>. |
5 | | |
6 | | Major speed upgrades to the method implementations at the Reykjavik |
7 | | NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. |
8 | | |
9 | | Copyright (c) Corporation for National Research Initiatives. |
10 | | |
11 | | -------------------------------------------------------------------- |
12 | | The original string type implementation is: |
13 | | |
14 | | Copyright (c) 1999 by Secret Labs AB |
15 | | Copyright (c) 1999 by Fredrik Lundh |
16 | | |
17 | | By obtaining, using, and/or copying this software and/or its |
18 | | associated documentation, you agree that you have read, understood, |
19 | | and will comply with the following terms and conditions: |
20 | | |
21 | | Permission to use, copy, modify, and distribute this software and its |
22 | | associated documentation for any purpose and without fee is hereby |
23 | | granted, provided that the above copyright notice appears in all |
24 | | copies, and that both that copyright notice and this permission notice |
25 | | appear in supporting documentation, and that the name of Secret Labs |
26 | | AB or the author not be used in advertising or publicity pertaining to |
27 | | distribution of the software without specific, written prior |
28 | | permission. |
29 | | |
30 | | SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO |
31 | | THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
32 | | FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR |
33 | | ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
34 | | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
35 | | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
36 | | OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
37 | | -------------------------------------------------------------------- |
38 | | |
39 | | */ |
40 | | |
41 | | // PyUnicode_Format() implementation |
42 | | |
43 | | #include "Python.h" |
44 | | #include "pycore_abstract.h" // _PyIndex_Check() |
45 | | #include "pycore_format.h" // F_ALT |
46 | | #include "pycore_long.h" // _PyLong_FormatWriter() |
47 | | #include "pycore_object.h" // _PyObject_IsUniquelyReferenced() |
48 | | #include "pycore_unicodeobject.h" // _Py_MAX_UNICODE |
49 | | |
50 | | |
51 | 0 | #define MAX_UNICODE _Py_MAX_UNICODE |
52 | 23.3M | #define ensure_unicode _PyUnicode_EnsureUnicode |
53 | | |
54 | | struct unicode_formatter_t { |
55 | | PyObject *args; |
56 | | int args_owned; |
57 | | Py_ssize_t arglen, argidx; |
58 | | PyObject *dict; |
59 | | |
60 | | int fmtkind; |
61 | | Py_ssize_t fmtcnt, fmtpos; |
62 | | const void *fmtdata; |
63 | | PyObject *fmtstr; |
64 | | |
65 | | _PyUnicodeWriter writer; |
66 | | }; |
67 | | |
68 | | |
69 | | struct unicode_format_arg_t { |
70 | | Py_UCS4 ch; |
71 | | int flags; |
72 | | Py_ssize_t width; |
73 | | int prec; |
74 | | int sign; |
75 | | }; |
76 | | |
77 | | |
78 | | static PyObject * |
79 | | unicode_format_getnextarg(struct unicode_formatter_t *ctx) |
80 | 45.4M | { |
81 | 45.4M | Py_ssize_t argidx = ctx->argidx; |
82 | | |
83 | 45.4M | if (argidx < ctx->arglen) { |
84 | 45.4M | ctx->argidx++; |
85 | 45.4M | if (ctx->arglen < 0) |
86 | 17.8M | return ctx->args; |
87 | 27.6M | else |
88 | 27.6M | return PyTuple_GetItem(ctx->args, argidx); |
89 | 45.4M | } |
90 | 0 | PyErr_SetString(PyExc_TypeError, |
91 | 0 | "not enough arguments for format string"); |
92 | 0 | return NULL; |
93 | 45.4M | } |
94 | | |
95 | | |
96 | | /* Returns a new reference to a PyUnicode object, or NULL on failure. */ |
97 | | |
98 | | /* Format a float into the writer if the writer is not NULL, or into *p_output |
99 | | otherwise. |
100 | | |
101 | | Return 0 on success, raise an exception and return -1 on error. */ |
102 | | static int |
103 | | formatfloat(PyObject *v, struct unicode_format_arg_t *arg, |
104 | | PyObject **p_output, |
105 | | _PyUnicodeWriter *writer) |
106 | 0 | { |
107 | 0 | char *p; |
108 | 0 | double x; |
109 | 0 | Py_ssize_t len; |
110 | 0 | int prec; |
111 | 0 | int dtoa_flags = 0; |
112 | |
|
113 | 0 | x = PyFloat_AsDouble(v); |
114 | 0 | if (x == -1.0 && PyErr_Occurred()) |
115 | 0 | return -1; |
116 | | |
117 | 0 | prec = arg->prec; |
118 | 0 | if (prec < 0) |
119 | 0 | prec = 6; |
120 | |
|
121 | 0 | if (arg->flags & F_ALT) |
122 | 0 | dtoa_flags |= Py_DTSF_ALT; |
123 | 0 | p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); |
124 | 0 | if (p == NULL) |
125 | 0 | return -1; |
126 | 0 | len = strlen(p); |
127 | 0 | if (writer) { |
128 | 0 | if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) { |
129 | 0 | PyMem_Free(p); |
130 | 0 | return -1; |
131 | 0 | } |
132 | 0 | } |
133 | 0 | else |
134 | 0 | *p_output = _PyUnicode_FromASCII(p, len); |
135 | 0 | PyMem_Free(p); |
136 | 0 | return 0; |
137 | 0 | } |
138 | | |
139 | | |
140 | | /* formatlong() emulates the format codes d, u, o, x and X, and |
141 | | * the F_ALT flag, for Python's long (unbounded) ints. It's not used for |
142 | | * Python's regular ints. |
143 | | * Return value: a new PyUnicodeObject*, or NULL if error. |
144 | | * The output string is of the form |
145 | | * "-"? ("0x" | "0X")? digit+ |
146 | | * "0x"/"0X" are present only for x and X conversions, with F_ALT |
147 | | * set in flags. The case of hex digits will be correct, |
148 | | * There will be at least prec digits, zero-filled on the left if |
149 | | * necessary to get that many. |
150 | | * val object to be converted |
151 | | * flags bitmask of format flags; only F_ALT is looked at |
152 | | * prec minimum number of digits; 0-fill on left if needed |
153 | | * type a character in [duoxX]; u acts the same as d |
154 | | * |
155 | | * CAUTION: o, x and X conversions on regular ints can never |
156 | | * produce a '-' sign, but can for Python's unbounded ints. |
157 | | */ |
158 | | PyObject * |
159 | | _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type) |
160 | 1.53k | { |
161 | 1.53k | PyObject *result = NULL; |
162 | 1.53k | char *buf; |
163 | 1.53k | Py_ssize_t i; |
164 | 1.53k | int sign; /* 1 if '-', else 0 */ |
165 | 1.53k | int len; /* number of characters */ |
166 | 1.53k | Py_ssize_t llen; |
167 | 1.53k | int numdigits; /* len == numnondigits + numdigits */ |
168 | 1.53k | int numnondigits = 0; |
169 | | |
170 | | /* Avoid exceeding SSIZE_T_MAX */ |
171 | 1.53k | if (prec > INT_MAX-3) { |
172 | 0 | PyErr_SetString(PyExc_OverflowError, |
173 | 0 | "precision too large"); |
174 | 0 | return NULL; |
175 | 0 | } |
176 | | |
177 | 1.53k | assert(PyLong_Check(val)); |
178 | | |
179 | 1.53k | switch (type) { |
180 | 0 | default: |
181 | 0 | Py_UNREACHABLE(); |
182 | 0 | case 'd': |
183 | 0 | case 'i': |
184 | 0 | case 'u': |
185 | | /* int and int subclasses should print numerically when a numeric */ |
186 | | /* format code is used (see issue18780) */ |
187 | 0 | result = PyNumber_ToBase(val, 10); |
188 | 0 | break; |
189 | 0 | case 'o': |
190 | 0 | numnondigits = 2; |
191 | 0 | result = PyNumber_ToBase(val, 8); |
192 | 0 | break; |
193 | 0 | case 'x': |
194 | 1.53k | case 'X': |
195 | 1.53k | numnondigits = 2; |
196 | 1.53k | result = PyNumber_ToBase(val, 16); |
197 | 1.53k | break; |
198 | 1.53k | } |
199 | 1.53k | if (!result) |
200 | 0 | return NULL; |
201 | | |
202 | 1.53k | assert(_PyUnicode_IsModifiable(result)); |
203 | 1.53k | assert(PyUnicode_IS_ASCII(result)); |
204 | | |
205 | | /* To modify the string in-place, there can only be one reference. */ |
206 | 1.53k | if (!_PyObject_IsUniquelyReferenced(result)) { |
207 | 0 | Py_DECREF(result); |
208 | 0 | PyErr_BadInternalCall(); |
209 | 0 | return NULL; |
210 | 0 | } |
211 | 1.53k | buf = PyUnicode_DATA(result); |
212 | 1.53k | llen = PyUnicode_GET_LENGTH(result); |
213 | 1.53k | if (llen > INT_MAX) { |
214 | 0 | Py_DECREF(result); |
215 | 0 | PyErr_SetString(PyExc_ValueError, |
216 | 0 | "string too large in _PyUnicode_FormatLong"); |
217 | 0 | return NULL; |
218 | 0 | } |
219 | 1.53k | len = (int)llen; |
220 | 1.53k | sign = buf[0] == '-'; |
221 | 1.53k | numnondigits += sign; |
222 | 1.53k | numdigits = len - numnondigits; |
223 | 1.53k | assert(numdigits > 0); |
224 | | |
225 | | /* Get rid of base marker unless F_ALT */ |
226 | 1.53k | if (((alt) == 0 && |
227 | 1.53k | (type == 'o' || type == 'x' || type == 'X'))) { |
228 | 1.53k | assert(buf[sign] == '0'); |
229 | 1.53k | assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || |
230 | 1.53k | buf[sign+1] == 'o'); |
231 | 1.53k | numnondigits -= 2; |
232 | 1.53k | buf += 2; |
233 | 1.53k | len -= 2; |
234 | 1.53k | if (sign) |
235 | 0 | buf[0] = '-'; |
236 | 1.53k | assert(len == numnondigits + numdigits); |
237 | 1.53k | assert(numdigits > 0); |
238 | 1.53k | } |
239 | | |
240 | | /* Fill with leading zeroes to meet minimum width. */ |
241 | 1.53k | if (prec > numdigits) { |
242 | 0 | PyObject *r1 = PyBytes_FromStringAndSize(NULL, |
243 | 0 | numnondigits + prec); |
244 | 0 | char *b1; |
245 | 0 | if (!r1) { |
246 | 0 | Py_DECREF(result); |
247 | 0 | return NULL; |
248 | 0 | } |
249 | 0 | b1 = PyBytes_AS_STRING(r1); |
250 | 0 | for (i = 0; i < numnondigits; ++i) |
251 | 0 | *b1++ = *buf++; |
252 | 0 | for (i = 0; i < prec - numdigits; i++) |
253 | 0 | *b1++ = '0'; |
254 | 0 | for (i = 0; i < numdigits; i++) |
255 | 0 | *b1++ = *buf++; |
256 | 0 | *b1 = '\0'; |
257 | 0 | Py_SETREF(result, r1); |
258 | 0 | buf = PyBytes_AS_STRING(result); |
259 | 0 | len = numnondigits + prec; |
260 | 0 | } |
261 | | |
262 | | /* Fix up case for hex conversions. */ |
263 | 1.53k | if (type == 'X') { |
264 | | /* Need to convert all lower case letters to upper case. |
265 | | and need to convert 0x to 0X (and -0x to -0X). */ |
266 | 4.51k | for (i = 0; i < len; i++) |
267 | 2.97k | if (buf[i] >= 'a' && buf[i] <= 'x') |
268 | 1.15k | buf[i] -= 'a'-'A'; |
269 | 1.53k | } |
270 | 1.53k | if (!PyUnicode_Check(result) |
271 | 1.53k | || buf != PyUnicode_DATA(result)) { |
272 | 1.53k | PyObject *unicode; |
273 | 1.53k | unicode = _PyUnicode_FromASCII(buf, len); |
274 | 1.53k | Py_SETREF(result, unicode); |
275 | 1.53k | } |
276 | 0 | else if (len != PyUnicode_GET_LENGTH(result)) { |
277 | 0 | if (PyUnicode_Resize(&result, len) < 0) |
278 | 0 | Py_CLEAR(result); |
279 | 0 | } |
280 | 1.53k | return result; |
281 | 1.53k | } |
282 | | |
283 | | |
284 | | /* Format an integer or a float as an integer. |
285 | | * Return 1 if the number has been formatted into the writer, |
286 | | * 0 if the number has been formatted into *p_output |
287 | | * -1 and raise an exception on error */ |
288 | | static int |
289 | | mainformatlong(PyObject *v, |
290 | | struct unicode_format_arg_t *arg, |
291 | | PyObject **p_output, |
292 | | _PyUnicodeWriter *writer) |
293 | 11.1M | { |
294 | 11.1M | PyObject *iobj, *res; |
295 | 11.1M | char type = (char)arg->ch; |
296 | | |
297 | 11.1M | if (!PyNumber_Check(v)) |
298 | 4.23M | goto wrongtype; |
299 | | |
300 | | /* make sure number is a type of integer for o, x, and X */ |
301 | 6.87M | if (!PyLong_Check(v)) { |
302 | 0 | if (type == 'o' || type == 'x' || type == 'X') { |
303 | 0 | iobj = _PyNumber_Index(v); |
304 | 0 | } |
305 | 0 | else { |
306 | 0 | iobj = PyNumber_Long(v); |
307 | 0 | } |
308 | 0 | if (iobj == NULL ) { |
309 | 0 | if (PyErr_ExceptionMatches(PyExc_TypeError)) |
310 | 0 | goto wrongtype; |
311 | 0 | return -1; |
312 | 0 | } |
313 | 0 | assert(PyLong_Check(iobj)); |
314 | 0 | } |
315 | 6.87M | else { |
316 | 6.87M | iobj = Py_NewRef(v); |
317 | 6.87M | } |
318 | | |
319 | 6.87M | if (PyLong_CheckExact(v) |
320 | 6.87M | && arg->width == -1 && arg->prec == -1 |
321 | 6.86M | && !(arg->flags & (F_SIGN | F_BLANK)) |
322 | 6.86M | && type != 'X') |
323 | 6.86M | { |
324 | | /* Fast path */ |
325 | 6.86M | int alternate = arg->flags & F_ALT; |
326 | 6.86M | int base; |
327 | | |
328 | 6.86M | switch(type) |
329 | 6.86M | { |
330 | 0 | default: |
331 | 0 | Py_UNREACHABLE(); |
332 | 6.86M | case 'd': |
333 | 6.86M | case 'i': |
334 | 6.86M | case 'u': |
335 | 6.86M | base = 10; |
336 | 6.86M | break; |
337 | 0 | case 'o': |
338 | 0 | base = 8; |
339 | 0 | break; |
340 | 0 | case 'x': |
341 | 0 | case 'X': |
342 | 0 | base = 16; |
343 | 0 | break; |
344 | 6.86M | } |
345 | | |
346 | 6.86M | if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { |
347 | 0 | Py_DECREF(iobj); |
348 | 0 | return -1; |
349 | 0 | } |
350 | 6.86M | Py_DECREF(iobj); |
351 | 6.86M | return 1; |
352 | 6.86M | } |
353 | | |
354 | 1.53k | res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type); |
355 | 1.53k | Py_DECREF(iobj); |
356 | 1.53k | if (res == NULL) |
357 | 0 | return -1; |
358 | 1.53k | *p_output = res; |
359 | 1.53k | return 0; |
360 | | |
361 | 4.23M | wrongtype: |
362 | 4.23M | switch(type) |
363 | 4.23M | { |
364 | 0 | case 'o': |
365 | 0 | case 'x': |
366 | 0 | case 'X': |
367 | 0 | PyErr_Format(PyExc_TypeError, |
368 | 0 | "%%%c format: an integer is required, " |
369 | 0 | "not %.200s", |
370 | 0 | type, Py_TYPE(v)->tp_name); |
371 | 0 | break; |
372 | 4.23M | default: |
373 | 4.23M | PyErr_Format(PyExc_TypeError, |
374 | 4.23M | "%%%c format: a real number is required, " |
375 | 4.23M | "not %.200s", |
376 | 4.23M | type, Py_TYPE(v)->tp_name); |
377 | 4.23M | break; |
378 | 4.23M | } |
379 | 4.23M | return -1; |
380 | 4.23M | } |
381 | | |
382 | | |
383 | | static Py_UCS4 |
384 | | formatchar(PyObject *v) |
385 | 0 | { |
386 | | /* presume that the buffer is at least 3 characters long */ |
387 | 0 | if (PyUnicode_Check(v)) { |
388 | 0 | if (PyUnicode_GET_LENGTH(v) == 1) { |
389 | 0 | return PyUnicode_READ_CHAR(v, 0); |
390 | 0 | } |
391 | 0 | PyErr_Format(PyExc_TypeError, |
392 | 0 | "%%c requires an int or a unicode character, " |
393 | 0 | "not a string of length %zd", |
394 | 0 | PyUnicode_GET_LENGTH(v)); |
395 | 0 | return (Py_UCS4) -1; |
396 | 0 | } |
397 | 0 | else { |
398 | 0 | int overflow; |
399 | 0 | long x = PyLong_AsLongAndOverflow(v, &overflow); |
400 | 0 | if (x == -1 && PyErr_Occurred()) { |
401 | 0 | if (PyErr_ExceptionMatches(PyExc_TypeError)) { |
402 | 0 | PyErr_Format(PyExc_TypeError, |
403 | 0 | "%%c requires an int or a unicode character, not %T", |
404 | 0 | v); |
405 | 0 | return (Py_UCS4) -1; |
406 | 0 | } |
407 | 0 | return (Py_UCS4) -1; |
408 | 0 | } |
409 | | |
410 | 0 | if (x < 0 || x > MAX_UNICODE) { |
411 | | /* this includes an overflow in converting to C long */ |
412 | 0 | PyErr_SetString(PyExc_OverflowError, |
413 | 0 | "%c arg not in range(0x110000)"); |
414 | 0 | return (Py_UCS4) -1; |
415 | 0 | } |
416 | | |
417 | 0 | return (Py_UCS4) x; |
418 | 0 | } |
419 | 0 | } |
420 | | |
421 | | |
422 | | /* Parse options of an argument: flags, width, precision. |
423 | | Handle also "%(name)" syntax. |
424 | | |
425 | | Return 0 if the argument has been formatted into arg->str. |
426 | | Return 1 if the argument has been written into ctx->writer, |
427 | | Raise an exception and return -1 on error. */ |
428 | | static int |
429 | | unicode_format_arg_parse(struct unicode_formatter_t *ctx, |
430 | | struct unicode_format_arg_t *arg) |
431 | 45.4M | { |
432 | 45.4M | #define FORMAT_READ(ctx) \ |
433 | 45.8M | PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) |
434 | | |
435 | 45.4M | PyObject *v; |
436 | | |
437 | 45.4M | if (arg->ch == '(') { |
438 | | /* Get argument value from a dictionary. Example: "%(name)s". */ |
439 | 39.1k | Py_ssize_t keystart; |
440 | 39.1k | Py_ssize_t keylen; |
441 | 39.1k | PyObject *key; |
442 | 39.1k | int pcount = 1; |
443 | | |
444 | 39.1k | if (ctx->dict == NULL) { |
445 | 0 | PyErr_SetString(PyExc_TypeError, |
446 | 0 | "format requires a mapping"); |
447 | 0 | return -1; |
448 | 0 | } |
449 | 39.1k | ++ctx->fmtpos; |
450 | 39.1k | --ctx->fmtcnt; |
451 | 39.1k | keystart = ctx->fmtpos; |
452 | | /* Skip over balanced parentheses */ |
453 | 352k | while (pcount > 0 && --ctx->fmtcnt >= 0) { |
454 | 313k | arg->ch = FORMAT_READ(ctx); |
455 | 313k | if (arg->ch == ')') |
456 | 39.1k | --pcount; |
457 | 273k | else if (arg->ch == '(') |
458 | 0 | ++pcount; |
459 | 313k | ctx->fmtpos++; |
460 | 313k | } |
461 | 39.1k | keylen = ctx->fmtpos - keystart - 1; |
462 | 39.1k | if (ctx->fmtcnt < 0 || pcount > 0) { |
463 | 0 | PyErr_SetString(PyExc_ValueError, |
464 | 0 | "incomplete format key"); |
465 | 0 | return -1; |
466 | 0 | } |
467 | 39.1k | key = PyUnicode_Substring(ctx->fmtstr, |
468 | 39.1k | keystart, keystart + keylen); |
469 | 39.1k | if (key == NULL) |
470 | 0 | return -1; |
471 | 39.1k | if (ctx->args_owned) { |
472 | 27.9k | ctx->args_owned = 0; |
473 | 27.9k | Py_DECREF(ctx->args); |
474 | 27.9k | } |
475 | 39.1k | ctx->args = PyObject_GetItem(ctx->dict, key); |
476 | 39.1k | Py_DECREF(key); |
477 | 39.1k | if (ctx->args == NULL) |
478 | 0 | return -1; |
479 | 39.1k | ctx->args_owned = 1; |
480 | 39.1k | ctx->arglen = -1; |
481 | 39.1k | ctx->argidx = -2; |
482 | 39.1k | } |
483 | | |
484 | | /* Parse flags. Example: "%+i" => flags=F_SIGN. */ |
485 | 45.4M | while (--ctx->fmtcnt >= 0) { |
486 | 45.4M | arg->ch = FORMAT_READ(ctx); |
487 | 45.4M | ctx->fmtpos++; |
488 | 45.4M | switch (arg->ch) { |
489 | 0 | case '-': arg->flags |= F_LJUST; continue; |
490 | 0 | case '+': arg->flags |= F_SIGN; continue; |
491 | 0 | case ' ': arg->flags |= F_BLANK; continue; |
492 | 0 | case '#': arg->flags |= F_ALT; continue; |
493 | 1.53k | case '0': arg->flags |= F_ZERO; continue; |
494 | 45.4M | } |
495 | 45.4M | break; |
496 | 45.4M | } |
497 | | |
498 | | /* Parse width. Example: "%10s" => width=10 */ |
499 | 45.4M | if (arg->ch == '*') { |
500 | 0 | v = unicode_format_getnextarg(ctx); |
501 | 0 | if (v == NULL) |
502 | 0 | return -1; |
503 | 0 | if (!PyLong_Check(v)) { |
504 | 0 | PyErr_SetString(PyExc_TypeError, |
505 | 0 | "* wants int"); |
506 | 0 | return -1; |
507 | 0 | } |
508 | 0 | arg->width = PyLong_AsSsize_t(v); |
509 | 0 | if (arg->width == -1 && PyErr_Occurred()) |
510 | 0 | return -1; |
511 | 0 | if (arg->width < 0) { |
512 | 0 | arg->flags |= F_LJUST; |
513 | 0 | arg->width = -arg->width; |
514 | 0 | } |
515 | 0 | if (--ctx->fmtcnt >= 0) { |
516 | 0 | arg->ch = FORMAT_READ(ctx); |
517 | 0 | ctx->fmtpos++; |
518 | 0 | } |
519 | 0 | } |
520 | 45.4M | else if (arg->ch >= '0' && arg->ch <= '9') { |
521 | 1.53k | arg->width = arg->ch - '0'; |
522 | 1.53k | while (--ctx->fmtcnt >= 0) { |
523 | 1.53k | arg->ch = FORMAT_READ(ctx); |
524 | 1.53k | ctx->fmtpos++; |
525 | 1.53k | if (arg->ch < '0' || arg->ch > '9') |
526 | 1.53k | break; |
527 | | /* Since arg->ch is unsigned, the RHS would end up as unsigned, |
528 | | mixing signed and unsigned comparison. Since arg->ch is between |
529 | | '0' and '9', casting to int is safe. */ |
530 | 0 | if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { |
531 | 0 | PyErr_SetString(PyExc_ValueError, |
532 | 0 | "width too big"); |
533 | 0 | return -1; |
534 | 0 | } |
535 | 0 | arg->width = arg->width*10 + (arg->ch - '0'); |
536 | 0 | } |
537 | 1.53k | } |
538 | | |
539 | | /* Parse precision. Example: "%.3f" => prec=3 */ |
540 | 45.4M | if (arg->ch == '.') { |
541 | 0 | arg->prec = 0; |
542 | 0 | if (--ctx->fmtcnt >= 0) { |
543 | 0 | arg->ch = FORMAT_READ(ctx); |
544 | 0 | ctx->fmtpos++; |
545 | 0 | } |
546 | 0 | if (arg->ch == '*') { |
547 | 0 | v = unicode_format_getnextarg(ctx); |
548 | 0 | if (v == NULL) |
549 | 0 | return -1; |
550 | 0 | if (!PyLong_Check(v)) { |
551 | 0 | PyErr_SetString(PyExc_TypeError, |
552 | 0 | "* wants int"); |
553 | 0 | return -1; |
554 | 0 | } |
555 | 0 | arg->prec = PyLong_AsInt(v); |
556 | 0 | if (arg->prec == -1 && PyErr_Occurred()) |
557 | 0 | return -1; |
558 | 0 | if (arg->prec < 0) |
559 | 0 | arg->prec = 0; |
560 | 0 | if (--ctx->fmtcnt >= 0) { |
561 | 0 | arg->ch = FORMAT_READ(ctx); |
562 | 0 | ctx->fmtpos++; |
563 | 0 | } |
564 | 0 | } |
565 | 0 | else if (arg->ch >= '0' && arg->ch <= '9') { |
566 | 0 | arg->prec = arg->ch - '0'; |
567 | 0 | while (--ctx->fmtcnt >= 0) { |
568 | 0 | arg->ch = FORMAT_READ(ctx); |
569 | 0 | ctx->fmtpos++; |
570 | 0 | if (arg->ch < '0' || arg->ch > '9') |
571 | 0 | break; |
572 | 0 | if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { |
573 | 0 | PyErr_SetString(PyExc_ValueError, |
574 | 0 | "precision too big"); |
575 | 0 | return -1; |
576 | 0 | } |
577 | 0 | arg->prec = arg->prec*10 + (arg->ch - '0'); |
578 | 0 | } |
579 | 0 | } |
580 | 0 | } |
581 | | |
582 | | /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ |
583 | 45.4M | if (ctx->fmtcnt >= 0) { |
584 | 45.4M | if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { |
585 | 0 | if (--ctx->fmtcnt >= 0) { |
586 | 0 | arg->ch = FORMAT_READ(ctx); |
587 | 0 | ctx->fmtpos++; |
588 | 0 | } |
589 | 0 | } |
590 | 45.4M | } |
591 | 45.4M | if (ctx->fmtcnt < 0) { |
592 | 0 | PyErr_SetString(PyExc_ValueError, |
593 | 0 | "incomplete format"); |
594 | 0 | return -1; |
595 | 0 | } |
596 | 45.4M | return 0; |
597 | | |
598 | 45.4M | #undef FORMAT_READ |
599 | 45.4M | } |
600 | | |
601 | | |
602 | | /* Format one argument. Supported conversion specifiers: |
603 | | |
604 | | - "s", "r", "a": any type |
605 | | - "i", "d", "u": int or float |
606 | | - "o", "x", "X": int |
607 | | - "e", "E", "f", "F", "g", "G": float |
608 | | - "c": int or str (1 character) |
609 | | |
610 | | When possible, the output is written directly into the Unicode writer |
611 | | (ctx->writer). A string is created when padding is required. |
612 | | |
613 | | Return 0 if the argument has been formatted into *p_str, |
614 | | 1 if the argument has been written into ctx->writer, |
615 | | -1 on error. */ |
616 | | static int |
617 | | unicode_format_arg_format(struct unicode_formatter_t *ctx, |
618 | | struct unicode_format_arg_t *arg, |
619 | | PyObject **p_str) |
620 | 45.4M | { |
621 | 45.4M | PyObject *v; |
622 | 45.4M | _PyUnicodeWriter *writer = &ctx->writer; |
623 | | |
624 | 45.4M | if (ctx->fmtcnt == 0) |
625 | 10.9M | ctx->writer.overallocate = 0; |
626 | | |
627 | 45.4M | v = unicode_format_getnextarg(ctx); |
628 | 45.4M | if (v == NULL) |
629 | 0 | return -1; |
630 | | |
631 | | |
632 | 45.4M | switch (arg->ch) { |
633 | 34.3M | case 's': |
634 | 34.3M | case 'r': |
635 | 34.3M | case 'a': |
636 | 34.3M | if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { |
637 | | /* Fast path */ |
638 | 0 | if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) |
639 | 0 | return -1; |
640 | 0 | return 1; |
641 | 0 | } |
642 | | |
643 | 34.3M | if (PyUnicode_CheckExact(v) && arg->ch == 's') { |
644 | 34.3M | *p_str = Py_NewRef(v); |
645 | 34.3M | } |
646 | 0 | else { |
647 | 0 | if (arg->ch == 's') |
648 | 0 | *p_str = PyObject_Str(v); |
649 | 0 | else if (arg->ch == 'r') |
650 | 0 | *p_str = PyObject_Repr(v); |
651 | 0 | else |
652 | 0 | *p_str = PyObject_ASCII(v); |
653 | 0 | } |
654 | 34.3M | break; |
655 | | |
656 | 0 | case 'i': |
657 | 11.1M | case 'd': |
658 | 11.1M | case 'u': |
659 | 11.1M | case 'o': |
660 | 11.1M | case 'x': |
661 | 11.1M | case 'X': |
662 | 11.1M | { |
663 | 11.1M | int ret = mainformatlong(v, arg, p_str, writer); |
664 | 11.1M | if (ret != 0) |
665 | 11.1M | return ret; |
666 | 1.53k | arg->sign = 1; |
667 | 1.53k | break; |
668 | 11.1M | } |
669 | | |
670 | 0 | case 'e': |
671 | 0 | case 'E': |
672 | 0 | case 'f': |
673 | 0 | case 'F': |
674 | 0 | case 'g': |
675 | 0 | case 'G': |
676 | 0 | if (arg->width == -1 && arg->prec == -1 |
677 | 0 | && !(arg->flags & (F_SIGN | F_BLANK))) |
678 | 0 | { |
679 | | /* Fast path */ |
680 | 0 | if (formatfloat(v, arg, NULL, writer) == -1) |
681 | 0 | return -1; |
682 | 0 | return 1; |
683 | 0 | } |
684 | | |
685 | 0 | arg->sign = 1; |
686 | 0 | if (formatfloat(v, arg, p_str, NULL) == -1) |
687 | 0 | return -1; |
688 | 0 | break; |
689 | | |
690 | 0 | case 'c': |
691 | 0 | { |
692 | 0 | Py_UCS4 ch = formatchar(v); |
693 | 0 | if (ch == (Py_UCS4) -1) |
694 | 0 | return -1; |
695 | 0 | if (arg->width == -1 && arg->prec == -1) { |
696 | | /* Fast path */ |
697 | 0 | if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) |
698 | 0 | return -1; |
699 | 0 | return 1; |
700 | 0 | } |
701 | 0 | *p_str = PyUnicode_FromOrdinal(ch); |
702 | 0 | break; |
703 | 0 | } |
704 | | |
705 | 0 | default: |
706 | 0 | PyErr_Format(PyExc_ValueError, |
707 | 0 | "unsupported format character '%c' (0x%x) " |
708 | 0 | "at index %zd", |
709 | 0 | (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?', |
710 | 0 | (int)arg->ch, |
711 | 0 | ctx->fmtpos - 1); |
712 | 0 | return -1; |
713 | 45.4M | } |
714 | 34.3M | if (*p_str == NULL) |
715 | 0 | return -1; |
716 | 34.3M | assert (PyUnicode_Check(*p_str)); |
717 | 34.3M | return 0; |
718 | 34.3M | } |
719 | | |
720 | | |
721 | | static int |
722 | | unicode_format_arg_output(struct unicode_formatter_t *ctx, |
723 | | struct unicode_format_arg_t *arg, |
724 | | PyObject *str) |
725 | 34.3M | { |
726 | 34.3M | Py_ssize_t len; |
727 | 34.3M | int kind; |
728 | 34.3M | const void *pbuf; |
729 | 34.3M | Py_ssize_t pindex; |
730 | 34.3M | Py_UCS4 signchar; |
731 | 34.3M | Py_ssize_t buflen; |
732 | 34.3M | Py_UCS4 maxchar; |
733 | 34.3M | Py_ssize_t sublen; |
734 | 34.3M | _PyUnicodeWriter *writer = &ctx->writer; |
735 | 34.3M | Py_UCS4 fill; |
736 | | |
737 | 34.3M | fill = ' '; |
738 | 34.3M | if (arg->sign && arg->flags & F_ZERO) |
739 | 1.53k | fill = '0'; |
740 | | |
741 | 34.3M | len = PyUnicode_GET_LENGTH(str); |
742 | 34.3M | if ((arg->width == -1 || arg->width <= len) |
743 | 34.3M | && (arg->prec == -1 || arg->prec >= len) |
744 | 34.3M | && !(arg->flags & (F_SIGN | F_BLANK))) |
745 | 34.3M | { |
746 | | /* Fast path */ |
747 | 34.3M | if (_PyUnicodeWriter_WriteStr(writer, str) == -1) |
748 | 0 | return -1; |
749 | 34.3M | return 0; |
750 | 34.3M | } |
751 | | |
752 | | /* Truncate the string for "s", "r" and "a" formats |
753 | | if the precision is set */ |
754 | 96 | if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { |
755 | 0 | if (arg->prec >= 0 && len > arg->prec) |
756 | 0 | len = arg->prec; |
757 | 0 | } |
758 | | |
759 | | /* Adjust sign and width */ |
760 | 96 | kind = PyUnicode_KIND(str); |
761 | 96 | pbuf = PyUnicode_DATA(str); |
762 | 96 | pindex = 0; |
763 | 96 | signchar = '\0'; |
764 | 96 | if (arg->sign) { |
765 | 96 | Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); |
766 | 96 | if (ch == '-' || ch == '+') { |
767 | 0 | signchar = ch; |
768 | 0 | len--; |
769 | 0 | pindex++; |
770 | 0 | } |
771 | 96 | else if (arg->flags & F_SIGN) |
772 | 0 | signchar = '+'; |
773 | 96 | else if (arg->flags & F_BLANK) |
774 | 0 | signchar = ' '; |
775 | 96 | else |
776 | 96 | arg->sign = 0; |
777 | 96 | } |
778 | 96 | if (arg->width < len) |
779 | 0 | arg->width = len; |
780 | | |
781 | | /* Prepare the writer */ |
782 | 96 | maxchar = writer->maxchar; |
783 | 96 | if (!(arg->flags & F_LJUST)) { |
784 | 96 | if (arg->sign) { |
785 | 0 | if ((arg->width-1) > len) |
786 | 0 | maxchar = Py_MAX(maxchar, fill); |
787 | 0 | } |
788 | 96 | else { |
789 | 96 | if (arg->width > len) |
790 | 96 | maxchar = Py_MAX(maxchar, fill); |
791 | 96 | } |
792 | 96 | } |
793 | 96 | if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) { |
794 | 0 | Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); |
795 | 0 | maxchar = Py_MAX(maxchar, strmaxchar); |
796 | 0 | } |
797 | | |
798 | 96 | buflen = arg->width; |
799 | 96 | if (arg->sign && len == arg->width) |
800 | 0 | buflen++; |
801 | 96 | if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1) |
802 | 0 | return -1; |
803 | | |
804 | | /* Write the sign if needed */ |
805 | 96 | if (arg->sign) { |
806 | 0 | if (fill != ' ') { |
807 | 0 | PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); |
808 | 0 | writer->pos += 1; |
809 | 0 | } |
810 | 0 | if (arg->width > len) |
811 | 0 | arg->width--; |
812 | 0 | } |
813 | | |
814 | | /* Write the numeric prefix for "x", "X" and "o" formats |
815 | | if the alternate form is used. |
816 | | For example, write "0x" for the "%#x" format. */ |
817 | 96 | if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { |
818 | 0 | assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); |
819 | 0 | assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); |
820 | 0 | if (fill != ' ') { |
821 | 0 | PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); |
822 | 0 | PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); |
823 | 0 | writer->pos += 2; |
824 | 0 | pindex += 2; |
825 | 0 | } |
826 | 0 | arg->width -= 2; |
827 | 0 | if (arg->width < 0) |
828 | 0 | arg->width = 0; |
829 | 0 | len -= 2; |
830 | 0 | } |
831 | | |
832 | | /* Pad left with the fill character if needed */ |
833 | 96 | if (arg->width > len && !(arg->flags & F_LJUST)) { |
834 | 96 | sublen = arg->width - len; |
835 | 96 | _PyUnicode_Fill(writer->kind, writer->data, fill, writer->pos, sublen); |
836 | 96 | writer->pos += sublen; |
837 | 96 | arg->width = len; |
838 | 96 | } |
839 | | |
840 | | /* If padding with spaces: write sign if needed and/or numeric prefix if |
841 | | the alternate form is used */ |
842 | 96 | if (fill == ' ') { |
843 | 0 | if (arg->sign) { |
844 | 0 | PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); |
845 | 0 | writer->pos += 1; |
846 | 0 | } |
847 | 0 | if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { |
848 | 0 | assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); |
849 | 0 | assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); |
850 | 0 | PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); |
851 | 0 | PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); |
852 | 0 | writer->pos += 2; |
853 | 0 | pindex += 2; |
854 | 0 | } |
855 | 0 | } |
856 | | |
857 | | /* Write characters */ |
858 | 96 | if (len) { |
859 | 96 | _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, |
860 | 96 | str, pindex, len); |
861 | 96 | writer->pos += len; |
862 | 96 | } |
863 | | |
864 | | /* Pad right with the fill character if needed */ |
865 | 96 | if (arg->width > len) { |
866 | 0 | sublen = arg->width - len; |
867 | 0 | _PyUnicode_Fill(writer->kind, writer->data, ' ', writer->pos, sublen); |
868 | 0 | writer->pos += sublen; |
869 | 0 | } |
870 | 96 | return 0; |
871 | 96 | } |
872 | | |
873 | | |
874 | | /* Helper of PyUnicode_Format(): format one arg. |
875 | | Return 0 on success, raise an exception and return -1 on error. */ |
876 | | static int |
877 | | unicode_format_arg(struct unicode_formatter_t *ctx) |
878 | 45.4M | { |
879 | 45.4M | struct unicode_format_arg_t arg; |
880 | 45.4M | PyObject *str; |
881 | 45.4M | int ret; |
882 | | |
883 | 45.4M | arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos); |
884 | 45.4M | if (arg.ch == '%') { |
885 | 0 | ctx->fmtpos++; |
886 | 0 | ctx->fmtcnt--; |
887 | 0 | if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0) |
888 | 0 | return -1; |
889 | 0 | return 0; |
890 | 0 | } |
891 | 45.4M | arg.flags = 0; |
892 | 45.4M | arg.width = -1; |
893 | 45.4M | arg.prec = -1; |
894 | 45.4M | arg.sign = 0; |
895 | 45.4M | str = NULL; |
896 | | |
897 | 45.4M | ret = unicode_format_arg_parse(ctx, &arg); |
898 | 45.4M | if (ret == -1) |
899 | 0 | return -1; |
900 | | |
901 | 45.4M | ret = unicode_format_arg_format(ctx, &arg, &str); |
902 | 45.4M | if (ret == -1) |
903 | 4.23M | return -1; |
904 | | |
905 | 41.2M | if (ret != 1) { |
906 | 34.3M | ret = unicode_format_arg_output(ctx, &arg, str); |
907 | 34.3M | Py_DECREF(str); |
908 | 34.3M | if (ret == -1) |
909 | 0 | return -1; |
910 | 34.3M | } |
911 | | |
912 | 41.2M | if (ctx->dict && (ctx->argidx < ctx->arglen)) { |
913 | 0 | PyErr_SetString(PyExc_TypeError, |
914 | 0 | "not all arguments converted during string formatting"); |
915 | 0 | return -1; |
916 | 0 | } |
917 | 41.2M | return 0; |
918 | 41.2M | } |
919 | | |
920 | | |
921 | | PyObject * |
922 | | PyUnicode_Format(PyObject *format, PyObject *args) |
923 | 23.3M | { |
924 | 23.3M | struct unicode_formatter_t ctx; |
925 | | |
926 | 23.3M | if (format == NULL || args == NULL) { |
927 | 0 | PyErr_BadInternalCall(); |
928 | 0 | return NULL; |
929 | 0 | } |
930 | | |
931 | 23.3M | if (ensure_unicode(format) < 0) |
932 | 0 | return NULL; |
933 | | |
934 | 23.3M | ctx.fmtstr = format; |
935 | 23.3M | ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); |
936 | 23.3M | ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); |
937 | 23.3M | ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); |
938 | 23.3M | ctx.fmtpos = 0; |
939 | | |
940 | 23.3M | _PyUnicodeWriter_Init(&ctx.writer); |
941 | 23.3M | ctx.writer.min_length = ctx.fmtcnt + 100; |
942 | 23.3M | ctx.writer.overallocate = 1; |
943 | | |
944 | 23.3M | if (PyTuple_Check(args)) { |
945 | 5.59M | ctx.arglen = PyTuple_Size(args); |
946 | 5.59M | ctx.argidx = 0; |
947 | 5.59M | } |
948 | 17.7M | else { |
949 | 17.7M | ctx.arglen = -1; |
950 | 17.7M | ctx.argidx = -2; |
951 | 17.7M | } |
952 | 23.3M | ctx.args_owned = 0; |
953 | 23.3M | if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args)) |
954 | 11.1k | ctx.dict = args; |
955 | 23.3M | else |
956 | 23.3M | ctx.dict = NULL; |
957 | 23.3M | ctx.args = args; |
958 | | |
959 | 111M | while (--ctx.fmtcnt >= 0) { |
960 | 92.4M | if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { |
961 | 46.9M | Py_ssize_t nonfmtpos; |
962 | | |
963 | 46.9M | nonfmtpos = ctx.fmtpos++; |
964 | 458M | while (ctx.fmtcnt >= 0 && |
965 | 446M | PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { |
966 | 411M | ctx.fmtpos++; |
967 | 411M | ctx.fmtcnt--; |
968 | 411M | } |
969 | 46.9M | if (ctx.fmtcnt < 0) { |
970 | 12.4M | ctx.fmtpos--; |
971 | 12.4M | ctx.writer.overallocate = 0; |
972 | 12.4M | } |
973 | | |
974 | 46.9M | if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr, |
975 | 46.9M | nonfmtpos, ctx.fmtpos) < 0) |
976 | 0 | goto onError; |
977 | 46.9M | } |
978 | 45.4M | else { |
979 | 45.4M | ctx.fmtpos++; |
980 | 45.4M | if (unicode_format_arg(&ctx) == -1) |
981 | 4.23M | goto onError; |
982 | 45.4M | } |
983 | 92.4M | } |
984 | | |
985 | 19.1M | if (ctx.argidx < ctx.arglen && !ctx.dict) { |
986 | 0 | PyErr_SetString(PyExc_TypeError, |
987 | 0 | "not all arguments converted during string formatting"); |
988 | 0 | goto onError; |
989 | 0 | } |
990 | | |
991 | 19.1M | if (ctx.args_owned) { |
992 | 11.1k | Py_DECREF(ctx.args); |
993 | 11.1k | } |
994 | 19.1M | return _PyUnicodeWriter_Finish(&ctx.writer); |
995 | | |
996 | 4.23M | onError: |
997 | 4.23M | _PyUnicodeWriter_Dealloc(&ctx.writer); |
998 | 4.23M | if (ctx.args_owned) { |
999 | 0 | Py_DECREF(ctx.args); |
1000 | 0 | } |
1001 | | return NULL; |
1002 | 19.1M | } |