Coverage Report

Created: 2026-05-30 06:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/stringlib/unicode_format.h
Line
Count
Source
1
/*
2
    unicode_format.h -- implementation of str.format().
3
*/
4
5
#include "pycore_complexobject.h" // _PyComplex_FormatAdvancedWriter()
6
#include "pycore_floatobject.h"   // _PyFloat_FormatAdvancedWriter()
7
#include "pycore_tuple.h"         // _PyTuple_FromPairSteal
8
9
/************************************************************************/
10
/***********   Global data structures and forward declarations  *********/
11
/************************************************************************/
12
13
/*
14
   A SubString consists of the characters of the unicode object 'str'
15
   between the indices 'start' (inclusive) and 'end' (exclusive).
16
*/
17
typedef struct {
18
    PyObject *str; /* borrowed reference */
19
    Py_ssize_t start, end;
20
} SubString;
21
22
23
typedef enum {
24
    ANS_INIT,
25
    ANS_AUTO,
26
    ANS_MANUAL
27
} AutoNumberState;   /* Keep track if we're auto-numbering fields */
28
29
/* Keeps track of our auto-numbering state, and which number field we're on */
30
typedef struct {
31
    AutoNumberState an_state;
32
    int an_field_number;
33
} AutoNumber;
34
35
36
/* forward declaration for recursion */
37
static PyObject *
38
build_string(SubString *input, PyObject *args, PyObject *kwargs,
39
             int recursion_depth, AutoNumber *auto_number);
40
41
42
43
/************************************************************************/
44
/**************************  Utility  functions  ************************/
45
/************************************************************************/
46
47
static void
48
AutoNumber_Init(AutoNumber *auto_number)
49
9.23M
{
50
9.23M
    auto_number->an_state = ANS_INIT;
51
9.23M
    auto_number->an_field_number = 0;
52
9.23M
}
53
54
/* fill in a SubString from a string object and start/end indices */
55
Py_LOCAL_INLINE(void)
56
SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end)
57
175M
{
58
175M
    str->str = s;
59
175M
    str->start = start;
60
175M
    str->end = end;
61
175M
}
62
63
/* return a new string.  if str->str is NULL, return None */
64
Py_LOCAL_INLINE(PyObject *)
65
SubString_new_object(SubString *str)
66
45.0k
{
67
45.0k
    if (str->str == NULL)
68
0
        Py_RETURN_NONE;
69
45.0k
    return PyUnicode_Substring(str->str, str->start, str->end);
70
45.0k
}
71
72
/* return a new string.  if str->str is NULL, return a new empty string */
73
Py_LOCAL_INLINE(PyObject *)
74
SubString_new_object_or_empty(SubString *str)
75
0
{
76
0
    if (str->str == NULL) {
77
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
78
0
    }
79
0
    return SubString_new_object(str);
80
0
}
81
82
/* Return 1 if an error has been detected switching between automatic
83
   field numbering and manual field specification, else return 0. Set
84
   ValueError on error. */
85
static int
86
autonumber_state_error(AutoNumberState state, int field_name_is_empty)
87
17.3M
{
88
17.3M
    if (state == ANS_MANUAL) {
89
432
        if (field_name_is_empty) {
90
0
            PyErr_SetString(PyExc_ValueError, "cannot switch from "
91
0
                            "manual field specification to "
92
0
                            "automatic field numbering");
93
0
            return 1;
94
0
        }
95
432
    }
96
17.3M
    else {
97
17.3M
        if (!field_name_is_empty) {
98
0
            PyErr_SetString(PyExc_ValueError, "cannot switch from "
99
0
                            "automatic field numbering to "
100
0
                            "manual field specification");
101
0
            return 1;
102
0
        }
103
17.3M
    }
104
17.3M
    return 0;
105
17.3M
}
106
107
108
/************************************************************************/
109
/***********  Format string parsing -- integers and identifiers *********/
110
/************************************************************************/
111
112
static Py_ssize_t
113
get_integer(const SubString *str)
114
17.3M
{
115
17.3M
    Py_ssize_t accumulator = 0;
116
17.3M
    Py_ssize_t digitval;
117
17.3M
    Py_ssize_t i;
118
119
    /* empty string is an error */
120
17.3M
    if (str->start >= str->end)
121
17.3M
        return -1;
122
123
45.9k
    for (i = str->start; i < str->end; i++) {
124
45.4k
        digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i));
125
45.4k
        if (digitval < 0)
126
45.0k
            return -1;
127
        /*
128
           Detect possible overflow before it happens:
129
130
              accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if
131
              accumulator > (PY_SSIZE_T_MAX - digitval) / 10.
132
        */
133
432
        if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
134
0
            PyErr_Format(PyExc_ValueError,
135
0
                         "Too many decimal digits in format string");
136
0
            return -1;
137
0
        }
138
432
        accumulator = accumulator * 10 + digitval;
139
432
    }
140
432
    return accumulator;
141
45.4k
}
142
143
/************************************************************************/
144
/******** Functions to get field objects and specification strings ******/
145
/************************************************************************/
146
147
/* do the equivalent of obj.name */
148
static PyObject *
149
getattr(PyObject *obj, SubString *name)
150
4
{
151
4
    PyObject *newobj;
152
4
    PyObject *str = SubString_new_object(name);
153
4
    if (str == NULL)
154
0
        return NULL;
155
4
    newobj = PyObject_GetAttr(obj, str);
156
4
    Py_DECREF(str);
157
4
    return newobj;
158
4
}
159
160
/* do the equivalent of obj[idx], where obj is a sequence */
161
static PyObject *
162
getitem_sequence(PyObject *obj, Py_ssize_t idx)
163
0
{
164
0
    return PySequence_GetItem(obj, idx);
165
0
}
166
167
/* do the equivalent of obj[idx], where obj is not a sequence */
168
static PyObject *
169
getitem_idx(PyObject *obj, Py_ssize_t idx)
170
0
{
171
0
    PyObject *newobj;
172
0
    PyObject *idx_obj = PyLong_FromSsize_t(idx);
173
0
    if (idx_obj == NULL)
174
0
        return NULL;
175
0
    newobj = PyObject_GetItem(obj, idx_obj);
176
0
    Py_DECREF(idx_obj);
177
0
    return newobj;
178
0
}
179
180
/* do the equivalent of obj[name] */
181
static PyObject *
182
getitem_str(PyObject *obj, SubString *name)
183
0
{
184
0
    PyObject *newobj;
185
0
    PyObject *str = SubString_new_object(name);
186
0
    if (str == NULL)
187
0
        return NULL;
188
0
    newobj = PyObject_GetItem(obj, str);
189
0
    Py_DECREF(str);
190
0
    return newobj;
191
0
}
192
193
typedef struct {
194
    /* the entire string we're parsing.  we assume that someone else
195
       is managing its lifetime, and that it will exist for the
196
       lifetime of the iterator.  can be empty */
197
    SubString str;
198
199
    /* index to where we are inside field_name */
200
    Py_ssize_t index;
201
} FieldNameIterator;
202
203
204
static int
205
FieldNameIterator_init(FieldNameIterator *self, PyObject *s,
206
                       Py_ssize_t start, Py_ssize_t end)
207
17.3M
{
208
17.3M
    SubString_init(&self->str, s, start, end);
209
17.3M
    self->index = start;
210
17.3M
    return 1;
211
17.3M
}
212
213
static int
214
_FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
215
4
{
216
4
    Py_UCS4 c;
217
218
4
    name->str = self->str.str;
219
4
    name->start = self->index;
220
221
    /* return everything until '.' or '[' */
222
52
    while (self->index < self->str.end) {
223
48
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
224
48
        switch (c) {
225
0
        case '[':
226
0
        case '.':
227
            /* back up so that this character will be seen next time */
228
0
            self->index--;
229
0
            break;
230
48
        default:
231
48
            continue;
232
48
        }
233
0
        break;
234
48
    }
235
    /* end of string is okay */
236
4
    name->end = self->index;
237
4
    return 1;
238
4
}
239
240
static int
241
_FieldNameIterator_item(FieldNameIterator *self, SubString *name)
242
0
{
243
0
    int bracket_seen = 0;
244
0
    Py_UCS4 c;
245
246
0
    name->str = self->str.str;
247
0
    name->start = self->index;
248
249
    /* return everything until ']' */
250
0
    while (self->index < self->str.end) {
251
0
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
252
0
        switch (c) {
253
0
        case ']':
254
0
            bracket_seen = 1;
255
0
            break;
256
0
        default:
257
0
            continue;
258
0
        }
259
0
        break;
260
0
    }
261
    /* make sure we ended with a ']' */
262
0
    if (!bracket_seen) {
263
0
        PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
264
0
        return 0;
265
0
    }
266
267
    /* end of string is okay */
268
    /* don't include the ']' */
269
0
    name->end = self->index-1;
270
0
    return 1;
271
0
}
272
273
/* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */
274
static int
275
FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
276
                       Py_ssize_t *name_idx, SubString *name)
277
17.3M
{
278
    /* check at end of input */
279
17.3M
    if (self->index >= self->str.end)
280
17.3M
        return 1;
281
282
4
    switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) {
283
4
    case '.':
284
4
        *is_attribute = 1;
285
4
        if (_FieldNameIterator_attr(self, name) == 0)
286
0
            return 0;
287
4
        *name_idx = -1;
288
4
        break;
289
0
    case '[':
290
0
        *is_attribute = 0;
291
0
        if (_FieldNameIterator_item(self, name) == 0)
292
0
            return 0;
293
0
        *name_idx = get_integer(name);
294
0
        if (*name_idx == -1 && PyErr_Occurred())
295
0
            return 0;
296
0
        break;
297
0
    default:
298
        /* Invalid character follows ']' */
299
0
        PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
300
0
                        "follow ']' in format field specifier");
301
0
        return 0;
302
4
    }
303
304
    /* empty string is an error */
305
4
    if (name->start == name->end) {
306
0
        PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
307
0
        return 0;
308
0
    }
309
310
4
    return 2;
311
4
}
312
313
314
/* input: field_name
315
   output: 'first' points to the part before the first '[' or '.'
316
           'first_idx' is -1 if 'first' is not an integer, otherwise
317
                       it's the value of first converted to an integer
318
           'rest' is an iterator to return the rest
319
*/
320
static int
321
field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first,
322
                 Py_ssize_t *first_idx, FieldNameIterator *rest,
323
                 AutoNumber *auto_number)
324
17.3M
{
325
17.3M
    Py_UCS4 c;
326
17.3M
    Py_ssize_t i = start;
327
17.3M
    int field_name_is_empty;
328
17.3M
    int using_numeric_index;
329
330
    /* find the part up until the first '.' or '[' */
331
17.6M
    while (i < end) {
332
326k
        switch (c = PyUnicode_READ_CHAR(str, i++)) {
333
0
        case '[':
334
4
        case '.':
335
            /* back up so that this character is available to the
336
               "rest" iterator */
337
4
            i--;
338
4
            break;
339
326k
        default:
340
326k
            continue;
341
326k
        }
342
4
        break;
343
326k
    }
344
345
    /* set up the return values */
346
17.3M
    SubString_init(first, str, start, i);
347
17.3M
    FieldNameIterator_init(rest, str, i, end);
348
349
    /* see if "first" is an integer, in which case it's used as an index */
350
17.3M
    *first_idx = get_integer(first);
351
17.3M
    if (*first_idx == -1 && PyErr_Occurred())
352
0
        return 0;
353
354
17.3M
    field_name_is_empty = first->start >= first->end;
355
356
    /* If the field name is omitted or if we have a numeric index
357
       specified, then we're doing numeric indexing into args. */
358
17.3M
    using_numeric_index = field_name_is_empty || *first_idx != -1;
359
360
    /* We always get here exactly one time for each field we're
361
       processing. And we get here in field order (counting by left
362
       braces). So this is the perfect place to handle automatic field
363
       numbering if the field name is omitted. */
364
365
    /* Check if we need to do the auto-numbering. It's not needed if
366
       we're called from string.Formatter routines, because it's handled
367
       in that class by itself. */
368
17.3M
    if (auto_number) {
369
        /* Initialize our auto numbering state if this is the first
370
           time we're either auto-numbering or manually numbering. */
371
17.3M
        if (auto_number->an_state == ANS_INIT && using_numeric_index)
372
9.22M
            auto_number->an_state = field_name_is_empty ?
373
9.22M
                ANS_AUTO : ANS_MANUAL;
374
375
        /* Make sure our state is consistent with what we're doing
376
           this time through. Only check if we're using a numeric
377
           index. */
378
17.3M
        if (using_numeric_index)
379
17.3M
            if (autonumber_state_error(auto_number->an_state,
380
17.3M
                                       field_name_is_empty))
381
0
                return 0;
382
        /* Zero length field means we want to do auto-numbering of the
383
           fields. */
384
17.3M
        if (field_name_is_empty)
385
17.3M
            *first_idx = (auto_number->an_field_number)++;
386
17.3M
    }
387
388
17.3M
    return 1;
389
17.3M
}
390
391
392
/*
393
    get_field_object returns the object inside {}, before the
394
    format_spec.  It handles getindex and getattr lookups and consumes
395
    the entire input string.
396
*/
397
static PyObject *
398
get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
399
                 AutoNumber *auto_number)
400
17.3M
{
401
17.3M
    PyObject *obj = NULL;
402
17.3M
    int ok;
403
17.3M
    int is_attribute;
404
17.3M
    SubString name;
405
17.3M
    SubString first;
406
17.3M
    Py_ssize_t index;
407
17.3M
    FieldNameIterator rest;
408
409
17.3M
    if (!field_name_split(input->str, input->start, input->end, &first,
410
17.3M
                          &index, &rest, auto_number)) {
411
0
        goto error;
412
0
    }
413
414
17.3M
    if (index == -1) {
415
        /* look up in kwargs */
416
45.0k
        PyObject *key = SubString_new_object(&first);
417
45.0k
        if (key == NULL) {
418
0
            goto error;
419
0
        }
420
45.0k
        if (kwargs == NULL) {
421
0
            PyErr_SetObject(PyExc_KeyError, key);
422
0
            Py_DECREF(key);
423
0
            goto error;
424
0
        }
425
        /* Use PyObject_GetItem instead of PyDict_GetItem because this
426
           code is no longer just used with kwargs. It might be passed
427
           a non-dict when called through format_map. */
428
45.0k
        obj = PyObject_GetItem(kwargs, key);
429
45.0k
        Py_DECREF(key);
430
45.0k
        if (obj == NULL) {
431
0
            goto error;
432
0
        }
433
45.0k
    }
434
17.3M
    else {
435
        /* If args is NULL, we have a format string with a positional field
436
           with only kwargs to retrieve it from. This can only happen when
437
           used with format_map(), where positional arguments are not
438
           allowed. */
439
17.3M
        if (args == NULL) {
440
0
            PyErr_SetString(PyExc_ValueError, "Format string contains "
441
0
                            "positional fields");
442
0
            goto error;
443
0
        }
444
445
        /* look up in args */
446
17.3M
        obj = PySequence_GetItem(args, index);
447
17.3M
        if (obj == NULL) {
448
0
            PyErr_Format(PyExc_IndexError,
449
0
                         "Replacement index %zd out of range for positional "
450
0
                         "args tuple",
451
0
                         index);
452
0
             goto error;
453
0
        }
454
17.3M
    }
455
456
    /* iterate over the rest of the field_name */
457
17.3M
    while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
458
17.3M
                                        &name)) == 2) {
459
4
        PyObject *tmp;
460
461
4
        if (is_attribute)
462
            /* getattr lookup "." */
463
4
            tmp = getattr(obj, &name);
464
0
        else
465
            /* getitem lookup "[]" */
466
0
            if (index == -1)
467
0
                tmp = getitem_str(obj, &name);
468
0
            else
469
0
                if (PySequence_Check(obj))
470
0
                    tmp = getitem_sequence(obj, index);
471
0
                else
472
                    /* not a sequence */
473
0
                    tmp = getitem_idx(obj, index);
474
4
        if (tmp == NULL)
475
0
            goto error;
476
477
        /* assign to obj */
478
4
        Py_SETREF(obj, tmp);
479
4
    }
480
    /* end of iterator, this is the non-error case */
481
17.3M
    if (ok == 1)
482
17.3M
        return obj;
483
0
error:
484
0
    Py_XDECREF(obj);
485
0
    return NULL;
486
17.3M
}
487
488
/************************************************************************/
489
/*****************  Field rendering functions  **************************/
490
/************************************************************************/
491
492
/*
493
    render_field() is the main function in this section.  It takes the
494
    field object and field specification string generated by
495
    get_field_and_spec, and renders the field into the output string.
496
497
    render_field calls fieldobj.__format__(format_spec) method, and
498
    appends to the output.
499
*/
500
static int
501
render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer)
502
17.3M
{
503
17.3M
    int ok = 0;
504
17.3M
    PyObject *result = NULL;
505
17.3M
    PyObject *format_spec_object = NULL;
506
17.3M
    int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
507
17.3M
    int err;
508
509
    /* If we know the type exactly, skip the lookup of __format__ and just
510
       call the formatter directly. */
511
17.3M
    if (PyUnicode_CheckExact(fieldobj))
512
17.1M
        formatter = _PyUnicode_FormatAdvancedWriter;
513
224k
    else if (PyLong_CheckExact(fieldobj))
514
58.7k
        formatter = _PyLong_FormatAdvancedWriter;
515
166k
    else if (PyFloat_CheckExact(fieldobj))
516
0
        formatter = _PyFloat_FormatAdvancedWriter;
517
166k
    else if (PyComplex_CheckExact(fieldobj))
518
0
        formatter = _PyComplex_FormatAdvancedWriter;
519
520
17.3M
    if (formatter) {
521
        /* we know exactly which formatter will be called when __format__ is
522
           looked up, so call it directly, instead. */
523
17.1M
        err = formatter(writer, fieldobj, format_spec->str,
524
17.1M
                        format_spec->start, format_spec->end);
525
17.1M
        return (err == 0);
526
17.1M
    }
527
166k
    else {
528
        /* We need to create an object out of the pointers we have, because
529
           __format__ takes a string/unicode object for format_spec. */
530
166k
        if (format_spec->str)
531
0
            format_spec_object = PyUnicode_Substring(format_spec->str,
532
0
                                                     format_spec->start,
533
0
                                                     format_spec->end);
534
166k
        else
535
166k
            format_spec_object = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
536
166k
        if (format_spec_object == NULL)
537
0
            goto done;
538
539
166k
        result = PyObject_Format(fieldobj, format_spec_object);
540
166k
    }
541
166k
    if (result == NULL)
542
1
        goto done;
543
544
166k
    if (_PyUnicodeWriter_WriteStr(writer, result) == -1)
545
0
        goto done;
546
166k
    ok = 1;
547
548
166k
done:
549
166k
    Py_XDECREF(format_spec_object);
550
166k
    Py_XDECREF(result);
551
166k
    return ok;
552
166k
}
553
554
static int
555
parse_field(SubString *str, SubString *field_name, SubString *format_spec,
556
            int *format_spec_needs_expanding, Py_UCS4 *conversion)
557
17.3M
{
558
    /* Note this function works if the field name is zero length,
559
       which is good.  Zero length field names are handled later, in
560
       field_name_split. */
561
562
17.3M
    Py_UCS4 c = 0;
563
564
    /* initialize these, as they may be empty */
565
17.3M
    *conversion = '\0';
566
17.3M
    SubString_init(format_spec, NULL, 0, 0);
567
568
    /* Search for the field name.  it's terminated by the end of
569
       the string, or a ':' or '!' */
570
17.3M
    field_name->str = str->str;
571
17.3M
    field_name->start = str->start;
572
17.6M
    while (str->start < str->end) {
573
17.6M
        switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
574
0
        case '{':
575
0
            PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
576
0
            return 0;
577
0
        case '[':
578
0
            for (; str->start < str->end; str->start++)
579
0
                if (PyUnicode_READ_CHAR(str->str, str->start) == ']')
580
0
                    break;
581
0
            continue;
582
16.7M
        case '}':
583
16.7M
        case ':':
584
17.3M
        case '!':
585
17.3M
            break;
586
327k
        default:
587
327k
            continue;
588
17.6M
        }
589
17.3M
        break;
590
17.6M
    }
591
592
17.3M
    field_name->end = str->start - 1;
593
17.3M
    if (c == '!' || c == ':') {
594
574k
        Py_ssize_t count;
595
        /* we have a format specifier and/or a conversion */
596
        /* don't include the last character */
597
598
        /* see if there's a conversion specifier */
599
574k
        if (c == '!') {
600
            /* there must be another character present */
601
574k
            if (str->start >= str->end) {
602
0
                PyErr_SetString(PyExc_ValueError,
603
0
                                "end of string while looking for conversion "
604
0
                                "specifier");
605
0
                return 0;
606
0
            }
607
574k
            *conversion = PyUnicode_READ_CHAR(str->str, str->start++);
608
609
574k
            if (str->start < str->end) {
610
574k
                c = PyUnicode_READ_CHAR(str->str, str->start++);
611
574k
                if (c == '}')
612
574k
                    return 1;
613
0
                if (c != ':') {
614
0
                    PyErr_SetString(PyExc_ValueError,
615
0
                                    "expected ':' after conversion specifier");
616
0
                    return 0;
617
0
                }
618
0
            }
619
574k
        }
620
192
        format_spec->str = str->str;
621
192
        format_spec->start = str->start;
622
192
        count = 1;
623
768
        while (str->start < str->end) {
624
768
            switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
625
0
            case '{':
626
0
                *format_spec_needs_expanding = 1;
627
0
                count++;
628
0
                break;
629
192
            case '}':
630
192
                count--;
631
192
                if (count == 0) {
632
192
                    format_spec->end = str->start - 1;
633
192
                    return 1;
634
192
                }
635
0
                break;
636
576
            default:
637
576
                break;
638
768
            }
639
768
        }
640
641
0
        PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
642
0
        return 0;
643
192
    }
644
16.7M
    else if (c != '}') {
645
0
        PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
646
0
        return 0;
647
0
    }
648
649
16.7M
    return 1;
650
17.3M
}
651
652
/************************************************************************/
653
/******* Output string allocation and escape-to-markup processing  ******/
654
/************************************************************************/
655
656
/* MarkupIterator breaks the string into pieces of either literal
657
   text, or things inside {} that need to be marked up.  it is
658
   designed to make it easy to wrap a Python iterator around it, for
659
   use with the Formatter class */
660
661
typedef struct {
662
    SubString str;
663
} MarkupIterator;
664
665
static int
666
MarkupIterator_init(MarkupIterator *self, PyObject *str,
667
                    Py_ssize_t start, Py_ssize_t end)
668
9.23M
{
669
9.23M
    SubString_init(&self->str, str, start, end);
670
9.23M
    return 1;
671
9.23M
}
672
673
/* returns 0 on error, 1 on non-error termination, and 2 if it got a
674
   string (or something to be expanded) */
675
static int
676
MarkupIterator_next(MarkupIterator *self, SubString *literal,
677
                    int *field_present, SubString *field_name,
678
                    SubString *format_spec, Py_UCS4 *conversion,
679
                    int *format_spec_needs_expanding)
680
34.9M
{
681
34.9M
    int at_end;
682
34.9M
    Py_UCS4 c = 0;
683
34.9M
    Py_ssize_t start;
684
34.9M
    Py_ssize_t len;
685
34.9M
    int markup_follows = 0;
686
687
    /* initialize all of the output variables */
688
34.9M
    SubString_init(literal, NULL, 0, 0);
689
34.9M
    SubString_init(field_name, NULL, 0, 0);
690
34.9M
    SubString_init(format_spec, NULL, 0, 0);
691
34.9M
    *conversion = '\0';
692
34.9M
    *format_spec_needs_expanding = 0;
693
34.9M
    *field_present = 0;
694
695
    /* No more input, end of iterator.  This is the normal exit
696
       path. */
697
34.9M
    if (self->str.start >= self->str.end)
698
9.23M
        return 1;
699
700
25.7M
    start = self->str.start;
701
702
    /* First read any literal text. Read until the end of string, an
703
       escaped '{' or '}', or an unescaped '{'.  In order to never
704
       allocate memory and so I can just pass pointers around, if
705
       there's an escaped '{' or '}' then we'll return the literal
706
       including the brace, but no format object.  The next time
707
       through, we'll return the rest of the literal, skipping past
708
       the second consecutive brace. */
709
78.2M
    while (self->str.start < self->str.end) {
710
69.8M
        switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
711
17.3M
        case '{':
712
17.3M
        case '}':
713
17.3M
            markup_follows = 1;
714
17.3M
            break;
715
52.4M
        default:
716
52.4M
            continue;
717
69.8M
        }
718
17.3M
        break;
719
69.8M
    }
720
721
25.7M
    at_end = self->str.start >= self->str.end;
722
25.7M
    len = self->str.start - start;
723
724
25.7M
    if ((c == '}') && (at_end ||
725
0
                       (c != PyUnicode_READ_CHAR(self->str.str,
726
0
                                                 self->str.start)))) {
727
0
        PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
728
0
                        "in format string");
729
0
        return 0;
730
0
    }
731
25.7M
    if (at_end && c == '{') {
732
0
        PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
733
0
                        "in format string");
734
0
        return 0;
735
0
    }
736
25.7M
    if (!at_end) {
737
17.3M
        if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) {
738
            /* escaped } or {, skip it in the input.  there is no
739
               markup object following us, just this literal text */
740
0
            self->str.start++;
741
0
            markup_follows = 0;
742
0
        }
743
17.3M
        else
744
17.3M
            len--;
745
17.3M
    }
746
747
    /* record the literal text */
748
25.7M
    literal->str = self->str.str;
749
25.7M
    literal->start = start;
750
25.7M
    literal->end = start + len;
751
752
25.7M
    if (!markup_follows)
753
8.39M
        return 2;
754
755
    /* this is markup; parse the field */
756
17.3M
    *field_present = 1;
757
17.3M
    if (!parse_field(&self->str, field_name, format_spec,
758
17.3M
                     format_spec_needs_expanding, conversion))
759
0
        return 0;
760
17.3M
    return 2;
761
17.3M
}
762
763
764
/* do the !r or !s conversion on obj */
765
static PyObject *
766
do_conversion(PyObject *obj, Py_UCS4 conversion)
767
574k
{
768
    /* XXX in pre-3.0, do we need to convert this to unicode, since it
769
       might have returned a string? */
770
574k
    switch (conversion) {
771
574k
    case 'r':
772
574k
        return PyObject_Repr(obj);
773
0
    case 's':
774
0
        return PyObject_Str(obj);
775
0
    case 'a':
776
0
        return PyObject_ASCII(obj);
777
0
    default:
778
0
        if (conversion > 32 && conversion < 127) {
779
                /* It's the ASCII subrange; casting to char is safe
780
                   (assuming the execution character set is an ASCII
781
                   superset). */
782
0
                PyErr_Format(PyExc_ValueError,
783
0
                     "Unknown conversion specifier %c",
784
0
                     (char)conversion);
785
0
        } else
786
0
                PyErr_Format(PyExc_ValueError,
787
0
                     "Unknown conversion specifier \\x%x",
788
0
                     (unsigned int)conversion);
789
0
        return NULL;
790
574k
    }
791
574k
}
792
793
/* given:
794
795
   {field_name!conversion:format_spec}
796
797
   compute the result and write it to output.
798
   format_spec_needs_expanding is an optimization.  if it's false,
799
   just output the string directly, otherwise recursively expand the
800
   format_spec string.
801
802
   field_name is allowed to be zero length, in which case we
803
   are doing auto field numbering.
804
*/
805
806
static int
807
output_markup(SubString *field_name, SubString *format_spec,
808
              int format_spec_needs_expanding, Py_UCS4 conversion,
809
              _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs,
810
              int recursion_depth, AutoNumber *auto_number)
811
17.3M
{
812
17.3M
    PyObject *tmp = NULL;
813
17.3M
    PyObject *fieldobj = NULL;
814
17.3M
    SubString expanded_format_spec;
815
17.3M
    SubString *actual_format_spec;
816
17.3M
    int result = 0;
817
818
    /* convert field_name to an object */
819
17.3M
    fieldobj = get_field_object(field_name, args, kwargs, auto_number);
820
17.3M
    if (fieldobj == NULL)
821
0
        goto done;
822
823
17.3M
    if (conversion != '\0') {
824
574k
        tmp = do_conversion(fieldobj, conversion);
825
574k
        if (tmp == NULL)
826
0
            goto done;
827
828
        /* do the assignment, transferring ownership: fieldobj = tmp */
829
574k
        Py_SETREF(fieldobj, tmp);
830
574k
        tmp = NULL;
831
574k
    }
832
833
    /* if needed, recursively compute the format_spec */
834
17.3M
    if (format_spec_needs_expanding) {
835
0
        tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
836
0
                           auto_number);
837
0
        if (tmp == NULL)
838
0
            goto done;
839
840
        /* note that in the case we're expanding the format string,
841
           tmp must be kept around until after the call to
842
           render_field. */
843
0
        SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp));
844
0
        actual_format_spec = &expanded_format_spec;
845
0
    }
846
17.3M
    else
847
17.3M
        actual_format_spec = format_spec;
848
849
17.3M
    if (render_field(fieldobj, actual_format_spec, writer) == 0)
850
1
        goto done;
851
852
17.3M
    result = 1;
853
854
17.3M
done:
855
17.3M
    Py_XDECREF(fieldobj);
856
17.3M
    Py_XDECREF(tmp);
857
858
17.3M
    return result;
859
17.3M
}
860
861
/*
862
    do_markup is the top-level loop for the format() method.  It
863
    searches through the format string for escapes to markup codes, and
864
    calls other functions to move non-markup text to the output,
865
    and to perform the markup to the output.
866
*/
867
static int
868
do_markup(SubString *input, PyObject *args, PyObject *kwargs,
869
          _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number)
870
9.23M
{
871
9.23M
    MarkupIterator iter;
872
9.23M
    int format_spec_needs_expanding;
873
9.23M
    int result;
874
9.23M
    int field_present;
875
9.23M
    SubString literal;
876
9.23M
    SubString field_name;
877
9.23M
    SubString format_spec;
878
9.23M
    Py_UCS4 conversion;
879
880
9.23M
    MarkupIterator_init(&iter, input->str, input->start, input->end);
881
34.9M
    while ((result = MarkupIterator_next(&iter, &literal, &field_present,
882
34.9M
                                         &field_name, &format_spec,
883
34.9M
                                         &conversion,
884
34.9M
                                         &format_spec_needs_expanding)) == 2) {
885
25.7M
        if (literal.end != literal.start) {
886
17.6M
            if (!field_present && iter.str.start == iter.str.end)
887
8.39M
                writer->overallocate = 0;
888
17.6M
            if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
889
17.6M
                                                literal.start, literal.end) < 0)
890
0
                return 0;
891
17.6M
        }
892
893
25.7M
        if (field_present) {
894
17.3M
            if (iter.str.start == iter.str.end)
895
839k
                writer->overallocate = 0;
896
17.3M
            if (!output_markup(&field_name, &format_spec,
897
17.3M
                               format_spec_needs_expanding, conversion, writer,
898
17.3M
                               args, kwargs, recursion_depth, auto_number))
899
1
                return 0;
900
17.3M
        }
901
25.7M
    }
902
9.23M
    return result;
903
9.23M
}
904
905
906
/*
907
    build_string allocates the output string and then
908
    calls do_markup to do the heavy lifting.
909
*/
910
static PyObject *
911
build_string(SubString *input, PyObject *args, PyObject *kwargs,
912
             int recursion_depth, AutoNumber *auto_number)
913
9.23M
{
914
9.23M
    _PyUnicodeWriter writer;
915
916
    /* check the recursion level */
917
9.23M
    if (recursion_depth <= 0) {
918
0
        PyErr_SetString(PyExc_ValueError,
919
0
                        "Max string recursion exceeded");
920
0
        return NULL;
921
0
    }
922
923
9.23M
    _PyUnicodeWriter_Init(&writer);
924
9.23M
    writer.overallocate = 1;
925
9.23M
    writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
926
927
9.23M
    if (!do_markup(input, args, kwargs, &writer, recursion_depth,
928
9.23M
                   auto_number)) {
929
1
        _PyUnicodeWriter_Dealloc(&writer);
930
1
        return NULL;
931
1
    }
932
933
9.23M
    return _PyUnicodeWriter_Finish(&writer);
934
9.23M
}
935
936
/************************************************************************/
937
/*********** main routine ***********************************************/
938
/************************************************************************/
939
940
/* this is the main entry point */
941
static PyObject *
942
do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
943
9.23M
{
944
9.23M
    SubString input;
945
946
    /* PEP 3101 says only 2 levels, so that
947
       "{0:{1}}".format('abc', 's')            # works
948
       "{0:{1:{2}}}".format('abc', 's', '')    # fails
949
    */
950
9.23M
    int recursion_depth = 2;
951
952
9.23M
    AutoNumber auto_number;
953
9.23M
    AutoNumber_Init(&auto_number);
954
9.23M
    SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self));
955
9.23M
    return build_string(&input, args, kwargs, recursion_depth, &auto_number);
956
9.23M
}
957
958
static PyObject *
959
do_string_format_map(PyObject *self, PyObject *obj)
960
0
{
961
0
    return do_string_format(self, NULL, obj);
962
0
}
963
964
965
/************************************************************************/
966
/*********** formatteriterator ******************************************/
967
/************************************************************************/
968
969
/* This is used to implement string.Formatter.parse().  It exists so
970
   Formatter can share code with the built-in str.format() method.
971
   It's really just a wrapper around MarkupIterator that is callable
972
   from Python. */
973
974
typedef struct {
975
    PyObject_HEAD
976
    PyObject *str;
977
    MarkupIterator it_markup;
978
} formatteriterobject;
979
980
static void
981
formatteriter_dealloc(PyObject *op)
982
0
{
983
0
    formatteriterobject *it = (formatteriterobject*)op;
984
0
    Py_XDECREF(it->str);
985
0
    PyObject_Free(it);
986
0
}
987
988
/* returns a tuple:
989
   (literal, field_name, format_spec, conversion)
990
991
   literal is any literal text to output.  might be zero length
992
   field_name is the string before the ':'.  might be None
993
   format_spec is the string after the ':'.  might be None
994
   conversion is either None, or the string after the '!'
995
*/
996
static PyObject *
997
formatteriter_next(PyObject *op)
998
0
{
999
0
    formatteriterobject *it = (formatteriterobject*)op;
1000
0
    SubString literal;
1001
0
    SubString field_name;
1002
0
    SubString format_spec;
1003
0
    Py_UCS4 conversion;
1004
0
    int format_spec_needs_expanding;
1005
0
    int field_present;
1006
0
    int result = MarkupIterator_next(&it->it_markup, &literal, &field_present,
1007
0
                                     &field_name, &format_spec, &conversion,
1008
0
                                     &format_spec_needs_expanding);
1009
1010
    /* all of the SubString objects point into it->str, so no
1011
       memory management needs to be done on them */
1012
0
    assert(0 <= result && result <= 2);
1013
0
    if (result == 0 || result == 1)
1014
        /* if 0, error has already been set, if 1, iterator is empty */
1015
0
        return NULL;
1016
0
    else {
1017
0
        PyObject *literal_str = NULL;
1018
0
        PyObject *field_name_str = NULL;
1019
0
        PyObject *format_spec_str = NULL;
1020
0
        PyObject *conversion_str = NULL;
1021
0
        PyObject *tuple = NULL;
1022
1023
0
        literal_str = SubString_new_object(&literal);
1024
0
        if (literal_str == NULL)
1025
0
            goto done;
1026
1027
0
        field_name_str = SubString_new_object(&field_name);
1028
0
        if (field_name_str == NULL)
1029
0
            goto done;
1030
1031
        /* if field_name is non-zero length, return a string for
1032
           format_spec (even if zero length), else return None */
1033
0
        format_spec_str = (field_present ?
1034
0
                           SubString_new_object_or_empty :
1035
0
                           SubString_new_object)(&format_spec);
1036
0
        if (format_spec_str == NULL)
1037
0
            goto done;
1038
1039
        /* if the conversion is not specified, return a None,
1040
           otherwise create a one length string with the conversion
1041
           character */
1042
0
        if (conversion == '\0') {
1043
0
            conversion_str = Py_NewRef(Py_None);
1044
0
        }
1045
0
        else
1046
0
            conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
1047
0
                                                       &conversion, 1);
1048
0
        if (conversion_str == NULL)
1049
0
            goto done;
1050
1051
0
        tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str,
1052
0
                             conversion_str);
1053
0
    done:
1054
0
        Py_XDECREF(literal_str);
1055
0
        Py_XDECREF(field_name_str);
1056
0
        Py_XDECREF(format_spec_str);
1057
0
        Py_XDECREF(conversion_str);
1058
0
        return tuple;
1059
0
    }
1060
0
}
1061
1062
static PyMethodDef formatteriter_methods[] = {
1063
    {NULL,              NULL}           /* sentinel */
1064
};
1065
1066
static PyTypeObject PyFormatterIter_Type = {
1067
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1068
    "formatteriterator",                /* tp_name */
1069
    sizeof(formatteriterobject),        /* tp_basicsize */
1070
    0,                                  /* tp_itemsize */
1071
    /* methods */
1072
    formatteriter_dealloc,              /* tp_dealloc */
1073
    0,                                  /* tp_vectorcall_offset */
1074
    0,                                  /* tp_getattr */
1075
    0,                                  /* tp_setattr */
1076
    0,                                  /* tp_as_async */
1077
    0,                                  /* tp_repr */
1078
    0,                                  /* tp_as_number */
1079
    0,                                  /* tp_as_sequence */
1080
    0,                                  /* tp_as_mapping */
1081
    0,                                  /* tp_hash */
1082
    0,                                  /* tp_call */
1083
    0,                                  /* tp_str */
1084
    PyObject_GenericGetAttr,            /* tp_getattro */
1085
    0,                                  /* tp_setattro */
1086
    0,                                  /* tp_as_buffer */
1087
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1088
    0,                                  /* tp_doc */
1089
    0,                                  /* tp_traverse */
1090
    0,                                  /* tp_clear */
1091
    0,                                  /* tp_richcompare */
1092
    0,                                  /* tp_weaklistoffset */
1093
    PyObject_SelfIter,                  /* tp_iter */
1094
    formatteriter_next,                 /* tp_iternext */
1095
    formatteriter_methods,              /* tp_methods */
1096
    0,
1097
};
1098
1099
/* formatter_parser is used to implement
1100
   string.Formatter.vformat.  it parses a string and returns tuples
1101
   describing the parsed elements.  It's a wrapper around
1102
   stringlib/unicode_format.h's MarkupIterator */
1103
static PyObject *
1104
formatter_parser(PyObject *Py_UNUSED(module), PyObject *self)
1105
0
{
1106
0
    formatteriterobject *it;
1107
1108
0
    if (!PyUnicode_Check(self)) {
1109
0
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1110
0
        return NULL;
1111
0
    }
1112
1113
0
    it = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
1114
0
    if (it == NULL)
1115
0
        return NULL;
1116
1117
    /* take ownership, give the object to the iterator */
1118
0
    it->str = Py_NewRef(self);
1119
1120
    /* initialize the contained MarkupIterator */
1121
0
    MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self));
1122
0
    return (PyObject *)it;
1123
0
}
1124
1125
1126
/************************************************************************/
1127
/*********** fieldnameiterator ******************************************/
1128
/************************************************************************/
1129
1130
1131
/* This is used to implement string.Formatter.get_field().  It parses the
1132
   field name into attribute and item values.  It's a Python-callable
1133
   wrapper around FieldNameIterator */
1134
1135
typedef struct {
1136
    PyObject_HEAD
1137
    PyObject *str;
1138
    FieldNameIterator it_field;
1139
} fieldnameiterobject;
1140
1141
static void
1142
fieldnameiter_dealloc(PyObject *op)
1143
0
{
1144
0
    fieldnameiterobject *it = (fieldnameiterobject*)op;
1145
0
    Py_XDECREF(it->str);
1146
0
    PyObject_Free(it);
1147
0
}
1148
1149
/* returns a tuple:
1150
   (is_attr, value)
1151
   is_attr is true if we used attribute syntax (e.g., '.foo')
1152
              false if we used index syntax (e.g., '[foo]')
1153
   value is an integer or string
1154
*/
1155
static PyObject *
1156
fieldnameiter_next(PyObject *op)
1157
0
{
1158
0
    fieldnameiterobject *it = (fieldnameiterobject*)op;
1159
0
    int result;
1160
0
    int is_attr;
1161
0
    Py_ssize_t idx;
1162
0
    SubString name;
1163
1164
0
    result = FieldNameIterator_next(&it->it_field, &is_attr,
1165
0
                                    &idx, &name);
1166
0
    if (result == 0 || result == 1)
1167
        /* if 0, error has already been set, if 1, iterator is empty */
1168
0
        return NULL;
1169
0
    else {
1170
0
        PyObject* result = NULL;
1171
0
        PyObject* is_attr_obj = NULL;
1172
0
        PyObject* obj = NULL;
1173
1174
0
        is_attr_obj = PyBool_FromLong(is_attr);
1175
0
        if (is_attr_obj == NULL)
1176
0
            goto error;
1177
1178
        /* either an integer or a string */
1179
0
        if (idx != -1)
1180
0
            obj = PyLong_FromSsize_t(idx);
1181
0
        else
1182
0
            obj = SubString_new_object(&name);
1183
0
        if (obj == NULL)
1184
0
            goto error;
1185
1186
        /* return a tuple of values */
1187
0
        return _PyTuple_FromPairSteal(is_attr_obj, obj);
1188
1189
0
    error:
1190
0
        Py_XDECREF(is_attr_obj);
1191
0
        Py_XDECREF(obj);
1192
0
        return result;
1193
0
    }
1194
0
}
1195
1196
static PyMethodDef fieldnameiter_methods[] = {
1197
    {NULL,              NULL}           /* sentinel */
1198
};
1199
1200
static PyTypeObject PyFieldNameIter_Type = {
1201
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1202
    "fieldnameiterator",                /* tp_name */
1203
    sizeof(fieldnameiterobject),        /* tp_basicsize */
1204
    0,                                  /* tp_itemsize */
1205
    /* methods */
1206
    fieldnameiter_dealloc,              /* tp_dealloc */
1207
    0,                                  /* tp_vectorcall_offset */
1208
    0,                                  /* tp_getattr */
1209
    0,                                  /* tp_setattr */
1210
    0,                                  /* tp_as_async */
1211
    0,                                  /* tp_repr */
1212
    0,                                  /* tp_as_number */
1213
    0,                                  /* tp_as_sequence */
1214
    0,                                  /* tp_as_mapping */
1215
    0,                                  /* tp_hash */
1216
    0,                                  /* tp_call */
1217
    0,                                  /* tp_str */
1218
    PyObject_GenericGetAttr,            /* tp_getattro */
1219
    0,                                  /* tp_setattro */
1220
    0,                                  /* tp_as_buffer */
1221
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1222
    0,                                  /* tp_doc */
1223
    0,                                  /* tp_traverse */
1224
    0,                                  /* tp_clear */
1225
    0,                                  /* tp_richcompare */
1226
    0,                                  /* tp_weaklistoffset */
1227
    PyObject_SelfIter,                  /* tp_iter */
1228
    fieldnameiter_next,                 /* tp_iternext */
1229
    fieldnameiter_methods,              /* tp_methods */
1230
    0};
1231
1232
/* formatter_field_name_split is used to implement
1233
   string.Formatter.vformat.  it takes a PEP 3101 "field name", and
1234
   returns a tuple of (first, rest): "first", the part before the
1235
   first '.' or '['; and "rest", an iterator for the rest of the field
1236
   name.  It's a wrapper around stringlib/unicode_format.h's
1237
   field_name_split.  The iterator it returns is a
1238
   FieldNameIterator */
1239
static PyObject *
1240
formatter_field_name_split(PyObject *Py_UNUSED(module), PyObject *self)
1241
0
{
1242
0
    SubString first;
1243
0
    Py_ssize_t first_idx;
1244
0
    fieldnameiterobject *it;
1245
1246
0
    PyObject *first_obj = NULL;
1247
0
    PyObject *result = NULL;
1248
1249
0
    if (!PyUnicode_Check(self)) {
1250
0
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1251
0
        return NULL;
1252
0
    }
1253
1254
0
    it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
1255
0
    if (it == NULL)
1256
0
        return NULL;
1257
1258
    /* take ownership, give the object to the iterator.  this is
1259
       just to keep the field_name alive */
1260
0
    it->str = Py_NewRef(self);
1261
1262
    /* Pass in auto_number = NULL. We'll return an empty string for
1263
       first_obj in that case. */
1264
0
    if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self),
1265
0
                          &first, &first_idx, &it->it_field, NULL))
1266
0
        goto error;
1267
1268
    /* first becomes an integer, if possible; else a string */
1269
0
    if (first_idx != -1)
1270
0
        first_obj = PyLong_FromSsize_t(first_idx);
1271
0
    else
1272
        /* convert "first" into a string object */
1273
0
        first_obj = SubString_new_object(&first);
1274
0
    if (first_obj == NULL)
1275
0
        goto error;
1276
1277
    /* return a tuple of values */
1278
0
    return _PyTuple_FromPairSteal(first_obj, (PyObject *)it);
1279
1280
0
error:
1281
0
    Py_XDECREF(it);
1282
0
    Py_XDECREF(first_obj);
1283
0
    return result;
1284
0
}