Coverage Report

Created: 2026-02-09 07:07

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicode_format.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
// PyUnicode_Format() implementation
42
43
#include "Python.h"
44
#include "pycore_abstract.h"      // _PyIndex_Check()
45
#include "pycore_format.h"        // F_ALT
46
#include "pycore_long.h"          // _PyLong_FormatWriter()
47
#include "pycore_object.h"        // _PyObject_IsUniquelyReferenced()
48
#include "pycore_unicodeobject.h" // _Py_MAX_UNICODE
49
50
51
0
/* Local shorthand for the largest code point accepted by "%c". */
#define MAX_UNICODE _Py_MAX_UNICODE
/* Local shorthand for the internal _PyUnicode_EnsureUnicode() helper. */
#define ensure_unicode _PyUnicode_EnsureUnicode
53
54
struct unicode_formatter_t {
55
    PyObject *args;
56
    int args_owned;
57
    Py_ssize_t arglen, argidx;
58
    PyObject *dict;
59
60
    int fmtkind;
61
    Py_ssize_t fmtcnt, fmtpos;
62
    const void *fmtdata;
63
    PyObject *fmtstr;
64
65
    _PyUnicodeWriter writer;
66
};
67
68
69
struct unicode_format_arg_t {
70
    Py_UCS4 ch;
71
    int flags;
72
    Py_ssize_t width;
73
    int prec;
74
    int sign;
75
    Py_ssize_t fmtstart;
76
    PyObject *key;
77
};
78
79
80
/* Raise EXC with a message that starts by naming the offending argument.
 *
 * The prefix uses the mapping key when arg->key is set, otherwise the
 * argument index when ctx->argidx is non-negative, otherwise no
 * identifier at all.  The macro relies on variables named `arg` and `ctx`
 * being in scope at the point of use.
 *
 * At least one variadic argument is required.  When the message takes no
 * argument of its own, use FORMAT_ERROR(exc, "...%s", "").
 */
#define FORMAT_ERROR(EXC, FMT, ...)                                         \
    do {                                                                    \
        if (arg->key != NULL) {                                             \
            PyErr_Format((EXC), "format argument %R: " FMT,                 \
                         arg->key, __VA_ARGS__);                            \
        }                                                                   \
        else if (ctx->argidx >= 0) {                                        \
            PyErr_Format((EXC), "format argument %zd: " FMT,                \
                         ctx->argidx, __VA_ARGS__);                         \
        }                                                                   \
        else {                                                              \
            PyErr_Format((EXC), "format argument: " FMT, __VA_ARGS__);      \
        }                                                                   \
    } while (0)
94
95
96
static PyObject *
97
unicode_format_getnextarg(struct unicode_formatter_t *ctx, int allowone)
98
38.5M
{
99
38.5M
    Py_ssize_t argidx = ctx->argidx;
100
101
38.5M
    if (argidx < ctx->arglen && (allowone || ctx->arglen >= 0)) {
102
38.5M
        ctx->argidx++;
103
38.5M
        if (ctx->arglen >= 0) {
104
29.9M
            return PyTuple_GetItem(ctx->args, argidx);
105
29.9M
        }
106
8.66M
        else if (allowone) {
107
8.66M
            return ctx->args;
108
8.66M
        }
109
38.5M
    }
110
0
    PyErr_Format(PyExc_TypeError,
111
0
                 "not enough arguments for format string (got %zd)",
112
0
                 ctx->arglen < 0 ? 1 : ctx->arglen);
113
0
    return NULL;
114
38.5M
}
115
116
117
/* Returns a new reference to a PyUnicode object, or NULL on failure. */
118
119
/* Format a float into the writer if the writer is not NULL, or into *p_output
120
   otherwise.
121
122
   Return 0 on success, raise an exception and return -1 on error. */
123
static int
124
formatfloat(PyObject *v,
125
            struct unicode_formatter_t *ctx,
126
            struct unicode_format_arg_t *arg,
127
            PyObject **p_output,
128
            _PyUnicodeWriter *writer)
129
96
{
130
96
    char *p;
131
96
    double x;
132
96
    Py_ssize_t len;
133
96
    int prec;
134
96
    int dtoa_flags = 0;
135
136
96
    x = PyFloat_AsDouble(v);
137
96
    if (x == -1.0 && PyErr_Occurred()) {
138
0
        if (PyErr_ExceptionMatches(PyExc_TypeError)) {
139
0
            FORMAT_ERROR(PyExc_TypeError,
140
0
                         "%%%c requires a real number, not %T",
141
0
                         arg->ch, v);
142
0
        }
143
0
        return -1;
144
0
    }
145
146
96
    prec = arg->prec;
147
96
    if (prec < 0)
148
0
        prec = 6;
149
150
96
    if (arg->flags & F_ALT)
151
0
        dtoa_flags |= Py_DTSF_ALT;
152
96
    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
153
96
    if (p == NULL)
154
0
        return -1;
155
96
    len = strlen(p);
156
96
    if (writer) {
157
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
158
0
            PyMem_Free(p);
159
0
            return -1;
160
0
        }
161
0
    }
162
96
    else
163
96
        *p_output = _PyUnicode_FromASCII(p, len);
164
96
    PyMem_Free(p);
165
96
    return 0;
166
96
}
167
168
169
/* formatlong() emulates the format codes d, u, o, x and X, and
170
 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
171
 * Python's regular ints.
172
 * Return value:  a new PyUnicodeObject*, or NULL if error.
173
 *     The output string is of the form
174
 *         "-"? ("0x" | "0X")? digit+
175
 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
176
 *         set in flags.  The case of hex digits will be correct,
177
 *     There will be at least prec digits, zero-filled on the left if
178
 *         necessary to get that many.
179
 * val          object to be converted
180
 * flags        bitmask of format flags; only F_ALT is looked at
181
 * prec         minimum number of digits; 0-fill on left if needed
182
 * type         a character in [duoxX]; u acts the same as d
183
 *
184
 * CAUTION:  o, x and X conversions on regular ints can never
185
 * produce a '-' sign, but can for Python's unbounded ints.
186
 */
187
PyObject *
188
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
189
7.17M
{
190
7.17M
    PyObject *result = NULL;
191
7.17M
    char *buf;
192
7.17M
    Py_ssize_t i;
193
7.17M
    int sign;           /* 1 if '-', else 0 */
194
7.17M
    int len;            /* number of characters */
195
7.17M
    Py_ssize_t llen;
196
7.17M
    int numdigits;      /* len == numnondigits + numdigits */
197
7.17M
    int numnondigits = 0;
198
199
    /* Avoid exceeding SSIZE_T_MAX */
200
7.17M
    if (prec > INT_MAX-3) {
201
0
        PyErr_SetString(PyExc_OverflowError,
202
0
                        "precision too large");
203
0
        return NULL;
204
0
    }
205
206
7.17M
    assert(PyLong_Check(val));
207
208
7.17M
    switch (type) {
209
0
    default:
210
0
        Py_UNREACHABLE();
211
3.18M
    case 'd':
212
3.18M
    case 'i':
213
3.18M
    case 'u':
214
        /* int and int subclasses should print numerically when a numeric */
215
        /* format code is used (see issue18780) */
216
3.18M
        result = PyNumber_ToBase(val, 10);
217
3.18M
        break;
218
0
    case 'o':
219
0
        numnondigits = 2;
220
0
        result = PyNumber_ToBase(val, 8);
221
0
        break;
222
82
    case 'x':
223
3.98M
    case 'X':
224
3.98M
        numnondigits = 2;
225
3.98M
        result = PyNumber_ToBase(val, 16);
226
3.98M
        break;
227
7.17M
    }
228
7.17M
    if (!result)
229
0
        return NULL;
230
231
7.17M
    assert(_PyUnicode_IsModifiable(result));
232
7.17M
    assert(PyUnicode_IS_ASCII(result));
233
234
    /* To modify the string in-place, there can only be one reference. */
235
7.17M
    if (!_PyObject_IsUniquelyReferenced(result)) {
236
0
        Py_DECREF(result);
237
0
        PyErr_BadInternalCall();
238
0
        return NULL;
239
0
    }
240
7.17M
    buf = PyUnicode_DATA(result);
241
7.17M
    llen = PyUnicode_GET_LENGTH(result);
242
7.17M
    if (llen > INT_MAX) {
243
0
        Py_DECREF(result);
244
0
        PyErr_SetString(PyExc_ValueError,
245
0
                        "string too large in _PyUnicode_FormatLong");
246
0
        return NULL;
247
0
    }
248
7.17M
    len = (int)llen;
249
7.17M
    sign = buf[0] == '-';
250
7.17M
    numnondigits += sign;
251
7.17M
    numdigits = len - numnondigits;
252
7.17M
    assert(numdigits > 0);
253
254
    /* Get rid of base marker unless F_ALT */
255
7.17M
    if (((alt) == 0 &&
256
7.17M
        (type == 'o' || type == 'x' || type == 'X'))) {
257
3.98M
        assert(buf[sign] == '0');
258
3.98M
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
259
3.98M
               buf[sign+1] == 'o');
260
3.98M
        numnondigits -= 2;
261
3.98M
        buf += 2;
262
3.98M
        len -= 2;
263
3.98M
        if (sign)
264
0
            buf[0] = '-';
265
3.98M
        assert(len == numnondigits + numdigits);
266
3.98M
        assert(numdigits > 0);
267
3.98M
    }
268
269
    /* Fill with leading zeroes to meet minimum width. */
270
7.17M
    if (prec > numdigits) {
271
0
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
272
0
                                numnondigits + prec);
273
0
        char *b1;
274
0
        if (!r1) {
275
0
            Py_DECREF(result);
276
0
            return NULL;
277
0
        }
278
0
        b1 = PyBytes_AS_STRING(r1);
279
0
        for (i = 0; i < numnondigits; ++i)
280
0
            *b1++ = *buf++;
281
0
        for (i = 0; i < prec - numdigits; i++)
282
0
            *b1++ = '0';
283
0
        for (i = 0; i < numdigits; i++)
284
0
            *b1++ = *buf++;
285
0
        *b1 = '\0';
286
0
        Py_SETREF(result, r1);
287
0
        buf = PyBytes_AS_STRING(result);
288
0
        len = numnondigits + prec;
289
0
    }
290
291
    /* Fix up case for hex conversions. */
292
7.17M
    if (type == 'X') {
293
        /* Need to convert all lower case letters to upper case.
294
           and need to convert 0x to 0X (and -0x to -0X). */
295
26.8M
        for (i = 0; i < len; i++)
296
22.8M
            if (buf[i] >= 'a' && buf[i] <= 'x')
297
4.67M
                buf[i] -= 'a'-'A';
298
3.98M
    }
299
7.17M
    if (!PyUnicode_Check(result)
300
7.17M
        || buf != PyUnicode_DATA(result)) {
301
3.98M
        PyObject *unicode;
302
3.98M
        unicode = _PyUnicode_FromASCII(buf, len);
303
3.98M
        Py_SETREF(result, unicode);
304
3.98M
    }
305
3.18M
    else if (len != PyUnicode_GET_LENGTH(result)) {
306
0
        if (PyUnicode_Resize(&result, len) < 0)
307
0
            Py_CLEAR(result);
308
0
    }
309
7.17M
    return result;
310
7.17M
}
311
312
313
/* Format an integer or a float as an integer.
314
 * Return 1 if the number has been formatted into the writer,
315
 *        0 if the number has been formatted into *p_output
316
 *       -1 and raise an exception on error */
317
static int
318
mainformatlong(PyObject *v,
319
               struct unicode_formatter_t *ctx,
320
               struct unicode_format_arg_t *arg,
321
               PyObject **p_output,
322
               _PyUnicodeWriter *writer)
323
13.4M
{
324
13.4M
    PyObject *iobj, *res;
325
13.4M
    char type = (char)arg->ch;
326
327
13.4M
    if (!PyNumber_Check(v))
328
2.24M
        goto wrongtype;
329
330
    /* make sure number is a type of integer for o, x, and X */
331
11.2M
    if (!PyLong_Check(v)) {
332
0
        if (type == 'o' || type == 'x' || type == 'X') {
333
0
            iobj = _PyNumber_Index(v);
334
0
        }
335
0
        else {
336
0
            iobj = PyNumber_Long(v);
337
0
        }
338
0
        if (iobj == NULL ) {
339
0
            if (PyErr_ExceptionMatches(PyExc_TypeError))
340
0
                goto wrongtype;
341
0
            return -1;
342
0
        }
343
0
        assert(PyLong_Check(iobj));
344
0
    }
345
11.2M
    else {
346
11.2M
        iobj = Py_NewRef(v);
347
11.2M
    }
348
349
11.2M
    if (PyLong_CheckExact(v)
350
11.2M
        && arg->width == -1 && arg->prec == -1
351
8.05M
        && !(arg->flags & (F_SIGN | F_BLANK))
352
8.05M
        && type != 'X')
353
4.06M
    {
354
        /* Fast path */
355
4.06M
        int alternate = arg->flags & F_ALT;
356
4.06M
        int base;
357
358
4.06M
        switch(type)
359
4.06M
        {
360
0
            default:
361
0
                Py_UNREACHABLE();
362
4.06M
            case 'd':
363
4.06M
            case 'i':
364
4.06M
            case 'u':
365
4.06M
                base = 10;
366
4.06M
                break;
367
0
            case 'o':
368
0
                base = 8;
369
0
                break;
370
27
            case 'x':
371
27
            case 'X':
372
27
                base = 16;
373
27
                break;
374
4.06M
        }
375
376
4.06M
        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
377
0
            Py_DECREF(iobj);
378
0
            return -1;
379
0
        }
380
4.06M
        Py_DECREF(iobj);
381
4.06M
        return 1;
382
4.06M
    }
383
384
7.17M
    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
385
7.17M
    Py_DECREF(iobj);
386
7.17M
    if (res == NULL)
387
0
        return -1;
388
7.17M
    *p_output = res;
389
7.17M
    return 0;
390
391
2.24M
wrongtype:
392
2.24M
    switch(type)
393
2.24M
    {
394
0
        case 'o':
395
0
        case 'x':
396
0
        case 'X':
397
0
            FORMAT_ERROR(PyExc_TypeError,
398
0
                         "%%%c requires an integer, not %T",
399
0
                         arg->ch, v);
400
0
            break;
401
2.24M
        default:
402
2.24M
            FORMAT_ERROR(PyExc_TypeError,
403
2.24M
                         "%%%c requires a real number, not %T",
404
2.24M
                         arg->ch, v);
405
2.24M
            break;
406
2.24M
    }
407
2.24M
    return -1;
408
2.24M
}
409
410
411
static Py_UCS4
412
formatchar(PyObject *v,
413
           struct unicode_formatter_t *ctx,
414
           struct unicode_format_arg_t *arg)
415
0
{
416
    /* presume that the buffer is at least 3 characters long */
417
0
    if (PyUnicode_Check(v)) {
418
0
        if (PyUnicode_GET_LENGTH(v) == 1) {
419
0
            return PyUnicode_READ_CHAR(v, 0);
420
0
        }
421
0
        FORMAT_ERROR(PyExc_TypeError,
422
0
                     "%%c requires an integer or a unicode character, "
423
0
                     "not a string of length %zd",
424
0
                     PyUnicode_GET_LENGTH(v));
425
0
        return (Py_UCS4) -1;
426
0
    }
427
0
    else {
428
0
        int overflow;
429
0
        long x = PyLong_AsLongAndOverflow(v, &overflow);
430
0
        if (x == -1 && PyErr_Occurred()) {
431
0
            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
432
0
                FORMAT_ERROR(PyExc_TypeError,
433
0
                             "%%c requires an integer or a unicode character, "
434
0
                             "not %T",
435
0
                             v);
436
0
            }
437
0
            return (Py_UCS4) -1;
438
0
        }
439
440
0
        if (x < 0 || x > MAX_UNICODE) {
441
            /* this includes an overflow in converting to C long */
442
0
            FORMAT_ERROR(PyExc_OverflowError,
443
0
                         "%%c argument not in range(0x110000)%s", "");
444
0
            return (Py_UCS4) -1;
445
0
        }
446
447
0
        return (Py_UCS4) x;
448
0
    }
449
0
}
450
451
452
/* Parse options of an argument: flags, width, precision.
453
   Handle also "%(name)" syntax.
454
455
   Return 0 if the argument has been formatted into arg->str.
456
   Return 1 if the argument has been written into ctx->writer,
457
   Raise an exception and return -1 on error. */
458
static int
459
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
460
                         struct unicode_format_arg_t *arg)
461
38.5M
{
462
38.5M
#define FORMAT_READ(ctx) \
463
42.0M
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
464
465
38.5M
    PyObject *v;
466
467
38.5M
    if (arg->ch == '(') {
468
        /* Get argument value from a dictionary. Example: "%(name)s". */
469
38.1k
        Py_ssize_t keystart;
470
38.1k
        Py_ssize_t keylen;
471
38.1k
        int pcount = 1;
472
473
38.1k
        if (ctx->dict == NULL) {
474
0
            PyErr_Format(PyExc_TypeError,
475
0
                         "format requires a mapping, not %T",
476
0
                         ctx->args);
477
0
            return -1;
478
0
        }
479
38.1k
        ++ctx->fmtpos;
480
38.1k
        --ctx->fmtcnt;
481
38.1k
        keystart = ctx->fmtpos;
482
        /* Skip over balanced parentheses */
483
343k
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
484
304k
            arg->ch = FORMAT_READ(ctx);
485
304k
            if (arg->ch == ')')
486
38.1k
                --pcount;
487
266k
            else if (arg->ch == '(')
488
0
                ++pcount;
489
304k
            ctx->fmtpos++;
490
304k
        }
491
38.1k
        keylen = ctx->fmtpos - keystart - 1;
492
38.1k
        if (ctx->fmtcnt < 0 || pcount > 0) {
493
0
            PyErr_Format(PyExc_ValueError,
494
0
                         "stray %% or incomplete format key at position %zd",
495
0
                         arg->fmtstart);
496
0
            return -1;
497
0
        }
498
38.1k
        arg->key = PyUnicode_Substring(ctx->fmtstr,
499
38.1k
                                       keystart, keystart + keylen);
500
38.1k
        if (arg->key == NULL)
501
0
            return -1;
502
38.1k
        if (ctx->args_owned) {
503
27.2k
            ctx->args_owned = 0;
504
27.2k
            Py_DECREF(ctx->args);
505
27.2k
        }
506
38.1k
        ctx->args = PyObject_GetItem(ctx->dict, arg->key);
507
38.1k
        if (ctx->args == NULL)
508
0
            return -1;
509
38.1k
        ctx->args_owned = 1;
510
38.1k
        ctx->arglen = -3;
511
38.1k
        ctx->argidx = -4;
512
38.1k
    }
513
38.5M
    else {
514
38.5M
        if (ctx->arglen < -1) {
515
0
            PyErr_Format(PyExc_ValueError,
516
0
                         "format requires a parenthesised mapping key "
517
0
                         "at position %zd",
518
0
                         arg->fmtstart);
519
0
            return -1;
520
0
        }
521
38.5M
    }
522
523
    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
524
38.5M
    while (--ctx->fmtcnt >= 0) {
525
38.5M
        arg->ch = FORMAT_READ(ctx);
526
38.5M
        ctx->fmtpos++;
527
38.5M
        switch (arg->ch) {
528
0
        case '-': arg->flags |= F_LJUST; continue;
529
0
        case '+': arg->flags |= F_SIGN; continue;
530
0
        case ' ': arg->flags |= F_BLANK; continue;
531
27
        case '#': arg->flags |= F_ALT; continue;
532
2.15k
        case '0': arg->flags |= F_ZERO; continue;
533
38.5M
        }
534
38.5M
        break;
535
38.5M
    }
536
537
    /* Parse width. Example: "%10s" => width=10 */
538
38.5M
    if (arg->ch == '*') {
539
0
        if (ctx->arglen < -1) {
540
0
            PyErr_Format(PyExc_ValueError,
541
0
                    "* cannot be used with a parenthesised mapping key "
542
0
                    "at position %zd",
543
0
                    arg->fmtstart);
544
0
            return -1;
545
0
        }
546
0
        v = unicode_format_getnextarg(ctx, 0);
547
0
        if (v == NULL)
548
0
            return -1;
549
0
        if (!PyLong_Check(v)) {
550
0
            FORMAT_ERROR(PyExc_TypeError, "* requires int, not %T", v);
551
0
            return -1;
552
0
        }
553
0
        arg->width = PyLong_AsSsize_t(v);
554
0
        if (arg->width == -1 && PyErr_Occurred()) {
555
0
            if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
556
0
                FORMAT_ERROR(PyExc_OverflowError,
557
0
                             "too big for width%s", "");
558
0
            }
559
0
            return -1;
560
0
        }
561
0
        if (arg->width < 0) {
562
0
            arg->flags |= F_LJUST;
563
0
            arg->width = -arg->width;
564
0
        }
565
0
        if (--ctx->fmtcnt >= 0) {
566
0
            arg->ch = FORMAT_READ(ctx);
567
0
            ctx->fmtpos++;
568
0
        }
569
0
    }
570
38.5M
    else if (arg->ch >= '0' && arg->ch <= '9') {
571
3.19M
        arg->width = arg->ch - '0';
572
3.19M
        while (--ctx->fmtcnt >= 0) {
573
3.19M
            arg->ch = FORMAT_READ(ctx);
574
3.19M
            ctx->fmtpos++;
575
3.19M
            if (arg->ch < '0' || arg->ch > '9')
576
3.19M
                break;
577
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
578
               mixing signed and unsigned comparison. Since arg->ch is between
579
               '0' and '9', casting to int is safe. */
580
0
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
581
0
                PyErr_Format(PyExc_ValueError,
582
0
                             "width too big at position %zd",
583
0
                             arg->fmtstart);
584
0
                return -1;
585
0
            }
586
0
            arg->width = arg->width*10 + (arg->ch - '0');
587
0
        }
588
3.19M
    }
589
590
    /* Parse precision. Example: "%.3f" => prec=3 */
591
38.5M
    if (arg->ch == '.') {
592
96
        arg->prec = 0;
593
96
        if (--ctx->fmtcnt >= 0) {
594
96
            arg->ch = FORMAT_READ(ctx);
595
96
            ctx->fmtpos++;
596
96
        }
597
96
        if (arg->ch == '*') {
598
0
            if (ctx->arglen < -1) {
599
0
                PyErr_Format(PyExc_ValueError,
600
0
                        "* cannot be used with a parenthesised mapping key "
601
0
                        "at position %zd",
602
0
                        arg->fmtstart);
603
0
                return -1;
604
0
            }
605
0
            v = unicode_format_getnextarg(ctx, 0);
606
0
            if (v == NULL)
607
0
                return -1;
608
0
            if (!PyLong_Check(v)) {
609
0
                FORMAT_ERROR(PyExc_TypeError, "* requires int, not %T", v);
610
0
                return -1;
611
0
            }
612
0
            arg->prec = PyLong_AsInt(v);
613
0
            if (arg->prec == -1 && PyErr_Occurred()) {
614
0
                if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
615
0
                    FORMAT_ERROR(PyExc_OverflowError,
616
0
                                 "too big for precision%s", "");
617
0
                }
618
0
                return -1;
619
0
            }
620
0
            if (arg->prec < 0)
621
0
                arg->prec = 0;
622
0
            if (--ctx->fmtcnt >= 0) {
623
0
                arg->ch = FORMAT_READ(ctx);
624
0
                ctx->fmtpos++;
625
0
            }
626
0
        }
627
96
        else if (arg->ch >= '0' && arg->ch <= '9') {
628
96
            arg->prec = arg->ch - '0';
629
96
            while (--ctx->fmtcnt >= 0) {
630
96
                arg->ch = FORMAT_READ(ctx);
631
96
                ctx->fmtpos++;
632
96
                if (arg->ch < '0' || arg->ch > '9')
633
96
                    break;
634
0
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
635
0
                    PyErr_Format(PyExc_ValueError,
636
0
                                 "precision too big at position %zd",
637
0
                                 arg->fmtstart);
638
0
                    return -1;
639
0
                }
640
0
                arg->prec = arg->prec*10 + (arg->ch - '0');
641
0
            }
642
96
        }
643
96
    }
644
645
    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
646
38.5M
    if (ctx->fmtcnt >= 0) {
647
38.5M
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
648
0
            if (--ctx->fmtcnt >= 0) {
649
0
                arg->ch = FORMAT_READ(ctx);
650
0
                ctx->fmtpos++;
651
0
            }
652
0
        }
653
38.5M
    }
654
38.5M
    if (ctx->fmtcnt < 0) {
655
0
        PyErr_Format(PyExc_ValueError,
656
0
                     "stray %% at position %zd", arg->fmtstart);
657
0
        return -1;
658
0
    }
659
38.5M
    return 0;
660
661
38.5M
#undef FORMAT_READ
662
38.5M
}
663
664
665
/* Format one argument. Supported conversion specifiers:
666
667
   - "s", "r", "a": any type
668
   - "i", "d", "u": int or float
669
   - "o", "x", "X": int
670
   - "e", "E", "f", "F", "g", "G": float
671
   - "c": int or str (1 character)
672
673
   When possible, the output is written directly into the Unicode writer
674
   (ctx->writer). A string is created when padding is required.
675
676
   Return 0 if the argument has been formatted into *p_str,
677
          1 if the argument has been written into ctx->writer,
678
         -1 on error. */
679
static int
680
unicode_format_arg_format(struct unicode_formatter_t *ctx,
681
                          struct unicode_format_arg_t *arg,
682
                          PyObject **p_str)
683
38.5M
{
684
38.5M
    PyObject *v;
685
38.5M
    _PyUnicodeWriter *writer = &ctx->writer;
686
687
38.5M
    if (ctx->fmtcnt == 0)
688
13.3M
        ctx->writer.overallocate = 0;
689
690
38.5M
    v = unicode_format_getnextarg(ctx, 1);
691
38.5M
    if (v == NULL)
692
0
        return -1;
693
694
695
38.5M
    switch (arg->ch) {
696
25.0M
    case 's':
697
25.0M
    case 'r':
698
25.0M
    case 'a':
699
25.0M
        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
700
            /* Fast path */
701
99
            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
702
0
                return -1;
703
99
            return 1;
704
99
        }
705
706
25.0M
        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
707
21.1M
            *p_str = Py_NewRef(v);
708
21.1M
        }
709
3.98M
        else {
710
3.98M
            if (arg->ch == 's')
711
3.98M
                *p_str = PyObject_Str(v);
712
6.16k
            else if (arg->ch == 'r')
713
6.16k
                *p_str = PyObject_Repr(v);
714
0
            else
715
0
                *p_str = PyObject_ASCII(v);
716
3.98M
        }
717
25.0M
        break;
718
719
0
    case 'i':
720
9.50M
    case 'd':
721
9.50M
    case 'u':
722
9.50M
    case 'o':
723
9.50M
    case 'x':
724
13.4M
    case 'X':
725
13.4M
    {
726
13.4M
        int ret = mainformatlong(v, ctx, arg, p_str, writer);
727
13.4M
        if (ret != 0)
728
6.31M
            return ret;
729
7.17M
        arg->sign = 1;
730
7.17M
        break;
731
13.4M
    }
732
733
0
    case 'e':
734
0
    case 'E':
735
96
    case 'f':
736
96
    case 'F':
737
96
    case 'g':
738
96
    case 'G':
739
96
        if (arg->width == -1 && arg->prec == -1
740
0
            && !(arg->flags & (F_SIGN | F_BLANK)))
741
0
        {
742
            /* Fast path */
743
0
            if (formatfloat(v, ctx, arg, NULL, writer) == -1)
744
0
                return -1;
745
0
            return 1;
746
0
        }
747
748
96
        arg->sign = 1;
749
96
        if (formatfloat(v, ctx, arg, p_str, NULL) == -1)
750
0
            return -1;
751
96
        break;
752
753
96
    case 'c':
754
0
    {
755
0
        Py_UCS4 ch = formatchar(v, ctx, arg);
756
0
        if (ch == (Py_UCS4) -1)
757
0
            return -1;
758
0
        if (arg->width == -1 && arg->prec == -1) {
759
            /* Fast path */
760
0
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
761
0
                return -1;
762
0
            return 1;
763
0
        }
764
0
        *p_str = PyUnicode_FromOrdinal(ch);
765
0
        break;
766
0
    }
767
768
0
    default:
769
0
        if (arg->ch < 128 && Py_ISALPHA(arg->ch)) {
770
0
            PyErr_Format(PyExc_ValueError,
771
0
                         "unsupported format %%%c at position %zd",
772
0
                         (int)arg->ch, arg->fmtstart);
773
0
        }
774
0
        else if (arg->ch == '\'') {
775
0
            PyErr_Format(PyExc_ValueError,
776
0
                         "stray %% at position %zd or unexpected "
777
0
                         "format character \"'\" at position %zd",
778
0
                         arg->fmtstart,
779
0
                         ctx->fmtpos - 1);
780
0
        }
781
0
        else if (arg->ch >= 32 && arg->ch < 127) {
782
0
            PyErr_Format(PyExc_ValueError,
783
0
                         "stray %% at position %zd or unexpected "
784
0
                         "format character '%c' at position %zd",
785
0
                         arg->fmtstart,
786
0
                         (int)arg->ch, ctx->fmtpos - 1);
787
0
        }
788
0
        else if (Py_UNICODE_ISPRINTABLE(arg->ch)) {
789
0
            PyErr_Format(PyExc_ValueError,
790
0
                         "stray %% at position %zd or unexpected "
791
0
                         "format character '%c' (U+%04X) at position %zd",
792
0
                         arg->fmtstart,
793
0
                         (int)arg->ch, (int)arg->ch, ctx->fmtpos - 1);
794
0
        }
795
0
        else {
796
0
            PyErr_Format(PyExc_ValueError,
797
0
                         "stray %% at position %zd or unexpected "
798
0
                         "format character U+%04X at position %zd",
799
0
                         arg->fmtstart, (int)arg->ch, ctx->fmtpos - 1);
800
0
        }
801
0
        return -1;
802
38.5M
    }
803
32.2M
    if (*p_str == NULL)
804
0
        return -1;
805
32.2M
    assert (PyUnicode_Check(*p_str));
806
32.2M
    return 0;
807
32.2M
}
808
809
810
static int
811
unicode_format_arg_output(struct unicode_formatter_t *ctx,
812
                          struct unicode_format_arg_t *arg,
813
                          PyObject *str)
814
32.2M
{
815
32.2M
    Py_ssize_t len;
816
32.2M
    int kind;
817
32.2M
    const void *pbuf;
818
32.2M
    Py_ssize_t pindex;
819
32.2M
    Py_UCS4 signchar;
820
32.2M
    Py_ssize_t buflen;
821
32.2M
    Py_UCS4 maxchar;
822
32.2M
    Py_ssize_t sublen;
823
32.2M
    _PyUnicodeWriter *writer = &ctx->writer;
824
32.2M
    Py_UCS4 fill;
825
826
32.2M
    fill = ' ';
827
32.2M
    if (arg->sign && arg->flags & F_ZERO)
828
2.15k
        fill = '0';
829
830
32.2M
    len = PyUnicode_GET_LENGTH(str);
831
32.2M
    if ((arg->width == -1 || arg->width <= len)
832
32.2M
        && (arg->prec == -1 || arg->prec >= len)
833
32.2M
        && !(arg->flags & (F_SIGN | F_BLANK)))
834
32.2M
    {
835
        /* Fast path */
836
32.2M
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
837
0
            return -1;
838
32.2M
        return 0;
839
32.2M
    }
840
841
    /* Truncate the string for "s", "r" and "a" formats
842
       if the precision is set */
843
16.9k
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
844
0
        if (arg->prec >= 0 && len > arg->prec)
845
0
            len = arg->prec;
846
0
    }
847
848
    /* Adjust sign and width */
849
16.9k
    kind = PyUnicode_KIND(str);
850
16.9k
    pbuf = PyUnicode_DATA(str);
851
16.9k
    pindex = 0;
852
16.9k
    signchar = '\0';
853
16.9k
    if (arg->sign) {
854
16.9k
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
855
16.9k
        if (ch == '-' || ch == '+') {
856
0
            signchar = ch;
857
0
            len--;
858
0
            pindex++;
859
0
        }
860
16.9k
        else if (arg->flags & F_SIGN)
861
0
            signchar = '+';
862
16.9k
        else if (arg->flags & F_BLANK)
863
0
            signchar = ' ';
864
16.9k
        else
865
16.9k
            arg->sign = 0;
866
16.9k
    }
867
16.9k
    if (arg->width < len)
868
96
        arg->width = len;
869
870
    /* Prepare the writer */
871
16.9k
    maxchar = writer->maxchar;
872
16.9k
    if (!(arg->flags & F_LJUST)) {
873
16.9k
        if (arg->sign) {
874
0
            if ((arg->width-1) > len)
875
0
                maxchar = Py_MAX(maxchar, fill);
876
0
        }
877
16.9k
        else {
878
16.9k
            if (arg->width > len)
879
16.8k
                maxchar = Py_MAX(maxchar, fill);
880
16.9k
        }
881
16.9k
    }
882
16.9k
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
883
0
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
884
0
        maxchar = Py_MAX(maxchar, strmaxchar);
885
0
    }
886
887
16.9k
    buflen = arg->width;
888
16.9k
    if (arg->sign && len == arg->width)
889
0
        buflen++;
890
16.9k
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
891
0
        return -1;
892
893
    /* Write the sign if needed */
894
16.9k
    if (arg->sign) {
895
0
        if (fill != ' ') {
896
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
897
0
            writer->pos += 1;
898
0
        }
899
0
        if (arg->width > len)
900
0
            arg->width--;
901
0
    }
902
903
    /* Write the numeric prefix for "x", "X" and "o" formats
904
       if the alternate form is used.
905
       For example, write "0x" for the "%#x" format. */
906
16.9k
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
907
0
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
908
0
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
909
0
        if (fill != ' ') {
910
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
911
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
912
0
            writer->pos += 2;
913
0
            pindex += 2;
914
0
        }
915
0
        arg->width -= 2;
916
0
        if (arg->width < 0)
917
0
            arg->width = 0;
918
0
        len -= 2;
919
0
    }
920
921
    /* Pad left with the fill character if needed */
922
16.9k
    if (arg->width > len && !(arg->flags & F_LJUST)) {
923
16.8k
        sublen = arg->width - len;
924
16.8k
        _PyUnicode_Fill(writer->kind, writer->data, fill, writer->pos, sublen);
925
16.8k
        writer->pos += sublen;
926
16.8k
        arg->width = len;
927
16.8k
    }
928
929
    /* If padding with spaces: write sign if needed and/or numeric prefix if
930
       the alternate form is used */
931
16.9k
    if (fill == ' ') {
932
16.7k
        if (arg->sign) {
933
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
934
0
            writer->pos += 1;
935
0
        }
936
16.7k
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
937
0
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
938
0
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
939
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
940
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
941
0
            writer->pos += 2;
942
0
            pindex += 2;
943
0
        }
944
16.7k
    }
945
946
    /* Write characters */
947
16.9k
    if (len) {
948
16.9k
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
949
16.9k
                                      str, pindex, len);
950
16.9k
        writer->pos += len;
951
16.9k
    }
952
953
    /* Pad right with the fill character if needed */
954
16.9k
    if (arg->width > len) {
955
0
        sublen = arg->width - len;
956
0
        _PyUnicode_Fill(writer->kind, writer->data, ' ', writer->pos, sublen);
957
0
        writer->pos += sublen;
958
0
    }
959
16.9k
    return 0;
960
16.9k
}
961
962
963
/* Helper of PyUnicode_Format(): format one arg.
964
   Return 0 on success, raise an exception and return -1 on error. */
965
static int
966
unicode_format_arg(struct unicode_formatter_t *ctx)
967
38.5M
{
968
38.5M
    struct unicode_format_arg_t arg;
969
38.5M
    PyObject *str;
970
38.5M
    int ret;
971
972
38.5M
    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
973
38.5M
    if (arg.ch == '%') {
974
0
        ctx->fmtpos++;
975
0
        ctx->fmtcnt--;
976
0
        if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
977
0
            return -1;
978
0
        return 0;
979
0
    }
980
38.5M
    arg.flags = 0;
981
38.5M
    arg.width = -1;
982
38.5M
    arg.prec = -1;
983
38.5M
    arg.sign = 0;
984
38.5M
    arg.fmtstart = ctx->fmtpos - 1;
985
38.5M
    arg.key = NULL;
986
38.5M
    str = NULL;
987
988
38.5M
    ret = unicode_format_arg_parse(ctx, &arg);
989
38.5M
    if (ret == -1) {
990
0
        goto onError;
991
0
    }
992
993
38.5M
    ret = unicode_format_arg_format(ctx, &arg, &str);
994
38.5M
    if (ret == -1) {
995
2.24M
        goto onError;
996
2.24M
    }
997
998
36.3M
    if (ret != 1) {
999
32.2M
        ret = unicode_format_arg_output(ctx, &arg, str);
1000
32.2M
        Py_DECREF(str);
1001
32.2M
        if (ret == -1) {
1002
0
            goto onError;
1003
0
        }
1004
32.2M
    }
1005
1006
36.3M
    if (ctx->dict && (ctx->argidx < ctx->arglen)) {
1007
        // XXX: Never happens?
1008
0
        PyErr_SetString(PyExc_TypeError,
1009
0
                        "not all arguments converted during string formatting");
1010
0
        goto onError;
1011
0
    }
1012
36.3M
    Py_XDECREF(arg.key);
1013
36.3M
    return 0;
1014
1015
2.24M
  onError:
1016
2.24M
    Py_XDECREF(arg.key);
1017
2.24M
    return -1;
1018
36.3M
}
1019
1020
1021
/* Build a new string by interpolating `args` into the %-style `format`
   string.

   `args` may be a tuple of values, a single non-tuple value, or a mapping
   (any object passing PyMapping_Check() that is neither a tuple nor a str).
   Return the object produced by _PyUnicodeWriter_Finish() on success; on
   error set an exception and return NULL. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (ensure_unicode(format) < 0) {
        return NULL;
    }

    /* Format string cursor: fmtpos is the read index, fmtcnt the number of
       characters not yet consumed. */
    ctx.fmtstr = format;
    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
    ctx.fmtpos = 0;

    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.overallocate = 1;
    ctx.writer.min_length = ctx.fmtcnt + 100;

    /* Argument source.  A non-tuple argument is tracked with the sentinel
       pair arglen == -1 / argidx == -2. */
    const int is_tuple = PyTuple_Check(args);
    ctx.arglen = is_tuple ? PyTuple_Size(args) : -1;
    ctx.argidx = is_tuple ? 0 : -2;
    ctx.args_owned = 0;
    ctx.dict = (!is_tuple && !PyUnicode_Check(args) && PyMapping_Check(args))
                   ? args : NULL;
    ctx.args = args;

    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) == '%') {
            /* Conversion specifier: step over the '%' and delegate. */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1) {
                goto fail;
            }
            continue;
        }

        /* Literal run: advance to the next '%' or to the end of the format
           string and copy the whole run in one go. */
        const Py_ssize_t literal_start = ctx.fmtpos;
        ctx.fmtpos++;
        while (ctx.fmtcnt >= 0
               && PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%')
        {
            ctx.fmtpos++;
            ctx.fmtcnt--;
        }
        if (ctx.fmtcnt < 0) {
            /* The scan stepped one position too far at the end of the
               format; this is the last chunk, so stop overallocating. */
            ctx.fmtpos--;
            ctx.writer.overallocate = 0;
        }

        if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                            literal_start, ctx.fmtpos) < 0)
        {
            goto fail;
        }
    }

    /* Positional arguments left over after the whole format was consumed. */
    if (!ctx.dict && ctx.argidx < ctx.arglen) {
        const int single_arg = (ctx.arglen < 0);
        const Py_ssize_t required = single_arg ? 0 : ctx.argidx;
        const Py_ssize_t got = single_arg ? 1 : ctx.arglen;
        PyErr_Format(PyExc_TypeError,
                     "not all arguments converted during string formatting "
                     "(required %zd, got %zd)",
                     required, got);
        goto fail;
    }

    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return _PyUnicodeWriter_Finish(&ctx.writer);

  fail:
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}