Coverage Report

Created: 2026-04-12 06:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/stringlib/unicode_format.h
Line
Count
Source
1
/*
2
    unicode_format.h -- implementation of str.format().
3
*/
4
5
#include "pycore_complexobject.h" // _PyComplex_FormatAdvancedWriter()
6
#include "pycore_floatobject.h"   // _PyFloat_FormatAdvancedWriter()
7
#include "pycore_tuple.h"         // _PyTuple_FromPairSteal
8
9
/************************************************************************/
10
/***********   Global data structures and forward declarations  *********/
11
/************************************************************************/
12
13
/*
14
   A SubString consists of the characters between two string or
15
   unicode pointers.
16
*/
17
typedef struct {
18
    PyObject *str; /* borrowed reference; NULL means "no string attached"
                      (see the SubString_new_object* helpers) */
19
    Py_ssize_t start, end; /* index range into str: start is inclusive,
                              end is exclusive */
20
} SubString;
21
22
23
typedef enum {
24
    ANS_INIT,    /* no numerically indexed field has been seen yet */
25
    ANS_AUTO,    /* fields with empty names are being numbered automatically */
26
    ANS_MANUAL   /* fields carry explicit numeric indexes */
27
} AutoNumberState;   /* Keep track if we're auto-numbering fields */
28
29
/* Keeps track of our auto-numbering state, and which number field we're on */
30
typedef struct {
31
    AutoNumberState an_state;   /* numbering mode currently in effect */
32
    int an_field_number;        /* index handed to the next empty-named
                                   field; post-incremented on each use */
33
} AutoNumber;
34
35
36
/* forward declaration for recursion */
37
static PyObject *
38
build_string(SubString *input, PyObject *args, PyObject *kwargs,
39
             int recursion_depth, AutoNumber *auto_number);
40
41
42
43
/************************************************************************/
44
/**************************  Utility  functions  ************************/
45
/************************************************************************/
46
47
static void
48
AutoNumber_Init(AutoNumber *auto_number)
49
8.60M
{
50
8.60M
    auto_number->an_state = ANS_INIT;
51
8.60M
    auto_number->an_field_number = 0;
52
8.60M
}
53
54
/* fill in a SubString from a pointer and length */
55
Py_LOCAL_INLINE(void)
56
SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end)
57
163M
{
58
163M
    str->str = s;
59
163M
    str->start = start;
60
163M
    str->end = end;
61
163M
}
62
63
/* return a new string.  if str->str is NULL, return None */
64
Py_LOCAL_INLINE(PyObject *)
65
SubString_new_object(SubString *str)
66
46.5k
{
67
46.5k
    if (str->str == NULL)
68
0
        Py_RETURN_NONE;
69
46.5k
    return PyUnicode_Substring(str->str, str->start, str->end);
70
46.5k
}
71
72
/* return a new string.  if str->str is NULL, return a new empty string */
73
Py_LOCAL_INLINE(PyObject *)
74
SubString_new_object_or_empty(SubString *str)
75
0
{
76
0
    if (str->str == NULL) {
77
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
78
0
    }
79
0
    return SubString_new_object(str);
80
0
}
81
82
/* Return 1 if an error has been detected switching between automatic
83
   field numbering and manual field specification, else return 0. Set
84
   ValueError on error. */
85
static int
86
autonumber_state_error(AutoNumberState state, int field_name_is_empty)
87
16.1M
{
88
16.1M
    if (state == ANS_MANUAL) {
89
432
        if (field_name_is_empty) {
90
0
            PyErr_SetString(PyExc_ValueError, "cannot switch from "
91
0
                            "manual field specification to "
92
0
                            "automatic field numbering");
93
0
            return 1;
94
0
        }
95
432
    }
96
16.1M
    else {
97
16.1M
        if (!field_name_is_empty) {
98
0
            PyErr_SetString(PyExc_ValueError, "cannot switch from "
99
0
                            "automatic field numbering to "
100
0
                            "manual field specification");
101
0
            return 1;
102
0
        }
103
16.1M
    }
104
16.1M
    return 0;
105
16.1M
}
106
107
108
/************************************************************************/
109
/***********  Format string parsing -- integers and identifiers *********/
110
/************************************************************************/
111
112
static Py_ssize_t
113
get_integer(const SubString *str)
114
16.2M
{
115
16.2M
    Py_ssize_t accumulator = 0;
116
16.2M
    Py_ssize_t digitval;
117
16.2M
    Py_ssize_t i;
118
119
    /* empty string is an error */
120
16.2M
    if (str->start >= str->end)
121
16.1M
        return -1;
122
123
47.3k
    for (i = str->start; i < str->end; i++) {
124
46.9k
        digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i));
125
46.9k
        if (digitval < 0)
126
46.5k
            return -1;
127
        /*
128
           Detect possible overflow before it happens:
129
130
              accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if
131
              accumulator > (PY_SSIZE_T_MAX - digitval) / 10.
132
        */
133
432
        if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
134
0
            PyErr_Format(PyExc_ValueError,
135
0
                         "Too many decimal digits in format string");
136
0
            return -1;
137
0
        }
138
432
        accumulator = accumulator * 10 + digitval;
139
432
    }
140
432
    return accumulator;
141
46.9k
}
142
143
/************************************************************************/
144
/******** Functions to get field objects and specification strings ******/
145
/************************************************************************/
146
147
/* do the equivalent of obj.name */
148
static PyObject *
149
getattr(PyObject *obj, SubString *name)
150
4
{
151
4
    PyObject *newobj;
152
4
    PyObject *str = SubString_new_object(name);
153
4
    if (str == NULL)
154
0
        return NULL;
155
4
    newobj = PyObject_GetAttr(obj, str);
156
4
    Py_DECREF(str);
157
4
    return newobj;
158
4
}
159
160
/* do the equivalent of obj[idx], where obj is a sequence */
161
static PyObject *
162
getitem_sequence(PyObject *obj, Py_ssize_t idx)
163
0
{
164
0
    return PySequence_GetItem(obj, idx);
165
0
}
166
167
/* do the equivalent of obj[idx], where obj is not a sequence */
168
static PyObject *
169
getitem_idx(PyObject *obj, Py_ssize_t idx)
170
0
{
171
0
    PyObject *newobj;
172
0
    PyObject *idx_obj = PyLong_FromSsize_t(idx);
173
0
    if (idx_obj == NULL)
174
0
        return NULL;
175
0
    newobj = PyObject_GetItem(obj, idx_obj);
176
0
    Py_DECREF(idx_obj);
177
0
    return newobj;
178
0
}
179
180
/* do the equivalent of obj[name] */
181
static PyObject *
182
getitem_str(PyObject *obj, SubString *name)
183
0
{
184
0
    PyObject *newobj;
185
0
    PyObject *str = SubString_new_object(name);
186
0
    if (str == NULL)
187
0
        return NULL;
188
0
    newobj = PyObject_GetItem(obj, str);
189
0
    Py_DECREF(str);
190
0
    return newobj;
191
0
}
192
193
typedef struct {
194
    /* the entire string we're parsing.  we assume that someone else
195
       is managing its lifetime, and that it will exist for the
196
       lifetime of the iterator.  can be empty */
197
    SubString str;
198
199
    /* index of the next character of str to examine; it starts at
       str.start and advances as components are consumed */
200
    Py_ssize_t index;
201
} FieldNameIterator;
202
203
204
static int
205
FieldNameIterator_init(FieldNameIterator *self, PyObject *s,
206
                       Py_ssize_t start, Py_ssize_t end)
207
16.2M
{
208
16.2M
    SubString_init(&self->str, s, start, end);
209
16.2M
    self->index = start;
210
16.2M
    return 1;
211
16.2M
}
212
213
static int
214
_FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
215
4
{
216
4
    Py_UCS4 c;
217
218
4
    name->str = self->str.str;
219
4
    name->start = self->index;
220
221
    /* return everything until '.' or '[' */
222
52
    while (self->index < self->str.end) {
223
48
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
224
48
        switch (c) {
225
0
        case '[':
226
0
        case '.':
227
            /* backup so that we this character will be seen next time */
228
0
            self->index--;
229
0
            break;
230
48
        default:
231
48
            continue;
232
48
        }
233
0
        break;
234
48
    }
235
    /* end of string is okay */
236
4
    name->end = self->index;
237
4
    return 1;
238
4
}
239
240
static int
241
_FieldNameIterator_item(FieldNameIterator *self, SubString *name)
242
0
{
243
0
    int bracket_seen = 0;
244
0
    Py_UCS4 c;
245
246
0
    name->str = self->str.str;
247
0
    name->start = self->index;
248
249
    /* return everything until ']' */
250
0
    while (self->index < self->str.end) {
251
0
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
252
0
        switch (c) {
253
0
        case ']':
254
0
            bracket_seen = 1;
255
0
            break;
256
0
        default:
257
0
            continue;
258
0
        }
259
0
        break;
260
0
    }
261
    /* make sure we ended with a ']' */
262
0
    if (!bracket_seen) {
263
0
        PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
264
0
        return 0;
265
0
    }
266
267
    /* end of string is okay */
268
    /* don't include the ']' */
269
0
    name->end = self->index-1;
270
0
    return 1;
271
0
}
272
273
/* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */
274
static int
275
FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
276
                       Py_ssize_t *name_idx, SubString *name)
277
16.2M
{
278
    /* check at end of input */
279
16.2M
    if (self->index >= self->str.end)
280
16.2M
        return 1;
281
282
4
    switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) {
283
4
    case '.':
284
4
        *is_attribute = 1;
285
4
        if (_FieldNameIterator_attr(self, name) == 0)
286
0
            return 0;
287
4
        *name_idx = -1;
288
4
        break;
289
0
    case '[':
290
0
        *is_attribute = 0;
291
0
        if (_FieldNameIterator_item(self, name) == 0)
292
0
            return 0;
293
0
        *name_idx = get_integer(name);
294
0
        if (*name_idx == -1 && PyErr_Occurred())
295
0
            return 0;
296
0
        break;
297
0
    default:
298
        /* Invalid character follows ']' */
299
0
        PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
300
0
                        "follow ']' in format field specifier");
301
0
        return 0;
302
4
    }
303
304
    /* empty string is an error */
305
4
    if (name->start == name->end) {
306
0
        PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
307
0
        return 0;
308
0
    }
309
310
4
    return 2;
311
4
}
312
313
314
/* input: field_name
315
   output: 'first' points to the part before the first '[' or '.'
316
           'first_idx' is -1 if 'first' is not an integer, otherwise
317
                       it's the value of first converted to an integer
318
           'rest' is an iterator to return the rest
319
*/
320
static int
321
field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first,
322
                 Py_ssize_t *first_idx, FieldNameIterator *rest,
323
                 AutoNumber *auto_number)
324
16.2M
{
325
16.2M
    Py_UCS4 c;
326
16.2M
    Py_ssize_t i = start;
327
16.2M
    int field_name_is_empty;
328
16.2M
    int using_numeric_index;
329
330
    /* find the part up until the first '.' or '[' */
331
16.5M
    while (i < end) {
332
337k
        switch (c = PyUnicode_READ_CHAR(str, i++)) {
333
0
        case '[':
334
4
        case '.':
335
            /* backup so that we this character is available to the
336
               "rest" iterator */
337
4
            i--;
338
4
            break;
339
337k
        default:
340
337k
            continue;
341
337k
        }
342
4
        break;
343
337k
    }
344
345
    /* set up the return values */
346
16.2M
    SubString_init(first, str, start, i);
347
16.2M
    FieldNameIterator_init(rest, str, i, end);
348
349
    /* see if "first" is an integer, in which case it's used as an index */
350
16.2M
    *first_idx = get_integer(first);
351
16.2M
    if (*first_idx == -1 && PyErr_Occurred())
352
0
        return 0;
353
354
16.2M
    field_name_is_empty = first->start >= first->end;
355
356
    /* If the field name is omitted or if we have a numeric index
357
       specified, then we're doing numeric indexing into args. */
358
16.2M
    using_numeric_index = field_name_is_empty || *first_idx != -1;
359
360
    /* We always get here exactly one time for each field we're
361
       processing. And we get here in field order (counting by left
362
       braces). So this is the perfect place to handle automatic field
363
       numbering if the field name is omitted. */
364
365
    /* Check if we need to do the auto-numbering. It's not needed if
366
       we're called from string.Format routines, because it's handled
367
       in that class by itself. */
368
16.2M
    if (auto_number) {
369
        /* Initialize our auto numbering state if this is the first
370
           time we're either auto-numbering or manually numbering. */
371
16.2M
        if (auto_number->an_state == ANS_INIT && using_numeric_index)
372
8.59M
            auto_number->an_state = field_name_is_empty ?
373
8.59M
                ANS_AUTO : ANS_MANUAL;
374
375
        /* Make sure our state is consistent with what we're doing
376
           this time through. Only check if we're using a numeric
377
           index. */
378
16.2M
        if (using_numeric_index)
379
16.1M
            if (autonumber_state_error(auto_number->an_state,
380
16.1M
                                       field_name_is_empty))
381
0
                return 0;
382
        /* Zero length field means we want to do auto-numbering of the
383
           fields. */
384
16.2M
        if (field_name_is_empty)
385
16.1M
            *first_idx = (auto_number->an_field_number)++;
386
16.2M
    }
387
388
16.2M
    return 1;
389
16.2M
}
390
391
392
/*
393
    get_field_object returns the object inside {}, before the
394
    format_spec.  It handles getindex and getattr lookups and consumes
395
    the entire input string.
396
*/
397
static PyObject *
398
get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
399
                 AutoNumber *auto_number)
400
16.2M
{
401
16.2M
    PyObject *obj = NULL;
402
16.2M
    int ok;
403
16.2M
    int is_attribute;
404
16.2M
    SubString name;
405
16.2M
    SubString first;
406
16.2M
    Py_ssize_t index;
407
16.2M
    FieldNameIterator rest;
408
409
16.2M
    if (!field_name_split(input->str, input->start, input->end, &first,
410
16.2M
                          &index, &rest, auto_number)) {
411
0
        goto error;
412
0
    }
413
414
16.2M
    if (index == -1) {
415
        /* look up in kwargs */
416
46.5k
        PyObject *key = SubString_new_object(&first);
417
46.5k
        if (key == NULL) {
418
0
            goto error;
419
0
        }
420
46.5k
        if (kwargs == NULL) {
421
0
            PyErr_SetObject(PyExc_KeyError, key);
422
0
            Py_DECREF(key);
423
0
            goto error;
424
0
        }
425
        /* Use PyObject_GetItem instead of PyDict_GetItem because this
426
           code is no longer just used with kwargs. It might be passed
427
           a non-dict when called through format_map. */
428
46.5k
        obj = PyObject_GetItem(kwargs, key);
429
46.5k
        Py_DECREF(key);
430
46.5k
        if (obj == NULL) {
431
0
            goto error;
432
0
        }
433
46.5k
    }
434
16.1M
    else {
435
        /* If args is NULL, we have a format string with a positional field
436
           with only kwargs to retrieve it from. This can only happen when
437
           used with format_map(), where positional arguments are not
438
           allowed. */
439
16.1M
        if (args == NULL) {
440
0
            PyErr_SetString(PyExc_ValueError, "Format string contains "
441
0
                            "positional fields");
442
0
            goto error;
443
0
        }
444
445
        /* look up in args */
446
16.1M
        obj = PySequence_GetItem(args, index);
447
16.1M
        if (obj == NULL) {
448
0
            PyErr_Format(PyExc_IndexError,
449
0
                         "Replacement index %zd out of range for positional "
450
0
                         "args tuple",
451
0
                         index);
452
0
             goto error;
453
0
        }
454
16.1M
    }
455
456
    /* iterate over the rest of the field_name */
457
16.2M
    while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
458
16.2M
                                        &name)) == 2) {
459
4
        PyObject *tmp;
460
461
4
        if (is_attribute)
462
            /* getattr lookup "." */
463
4
            tmp = getattr(obj, &name);
464
0
        else
465
            /* getitem lookup "[]" */
466
0
            if (index == -1)
467
0
                tmp = getitem_str(obj, &name);
468
0
            else
469
0
                if (PySequence_Check(obj))
470
0
                    tmp = getitem_sequence(obj, index);
471
0
                else
472
                    /* not a sequence */
473
0
                    tmp = getitem_idx(obj, index);
474
4
        if (tmp == NULL)
475
0
            goto error;
476
477
        /* assign to obj */
478
4
        Py_SETREF(obj, tmp);
479
4
    }
480
    /* end of iterator, this is the non-error case */
481
16.2M
    if (ok == 1)
482
16.2M
        return obj;
483
0
error:
484
0
    Py_XDECREF(obj);
485
0
    return NULL;
486
16.2M
}
487
488
/************************************************************************/
489
/*****************  Field rendering functions  **************************/
490
/************************************************************************/
491
492
/*
493
    render_field() is the main function in this section.  It takes the
494
    field object and field specification string generated by
495
    get_field_and_spec, and renders the field into the output string.
496
497
    render_field calls fieldobj.__format__(format_spec) method, and
498
    appends to the output.
499
*/
500
static int
501
render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer)
502
16.2M
{
503
16.2M
    int ok = 0;
504
16.2M
    PyObject *result = NULL;
505
16.2M
    PyObject *format_spec_object = NULL;
506
16.2M
    int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
507
16.2M
    int err;
508
509
    /* If we know the type exactly, skip the lookup of __format__ and just
510
       call the formatter directly. */
511
16.2M
    if (PyUnicode_CheckExact(fieldobj))
512
15.9M
        formatter = _PyUnicode_FormatAdvancedWriter;
513
221k
    else if (PyLong_CheckExact(fieldobj))
514
66.0k
        formatter = _PyLong_FormatAdvancedWriter;
515
155k
    else if (PyFloat_CheckExact(fieldobj))
516
0
        formatter = _PyFloat_FormatAdvancedWriter;
517
155k
    else if (PyComplex_CheckExact(fieldobj))
518
0
        formatter = _PyComplex_FormatAdvancedWriter;
519
520
16.2M
    if (formatter) {
521
        /* we know exactly which formatter will be called when __format__ is
522
           looked up, so call it directly, instead. */
523
16.0M
        err = formatter(writer, fieldobj, format_spec->str,
524
16.0M
                        format_spec->start, format_spec->end);
525
16.0M
        return (err == 0);
526
16.0M
    }
527
155k
    else {
528
        /* We need to create an object out of the pointers we have, because
529
           __format__ takes a string/unicode object for format_spec. */
530
155k
        if (format_spec->str)
531
0
            format_spec_object = PyUnicode_Substring(format_spec->str,
532
0
                                                     format_spec->start,
533
0
                                                     format_spec->end);
534
155k
        else
535
155k
            format_spec_object = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
536
155k
        if (format_spec_object == NULL)
537
0
            goto done;
538
539
155k
        result = PyObject_Format(fieldobj, format_spec_object);
540
155k
    }
541
155k
    if (result == NULL)
542
1
        goto done;
543
544
155k
    if (_PyUnicodeWriter_WriteStr(writer, result) == -1)
545
0
        goto done;
546
155k
    ok = 1;
547
548
155k
done:
549
155k
    Py_XDECREF(format_spec_object);
550
155k
    Py_XDECREF(result);
551
155k
    return ok;
552
155k
}
553
554
static int
555
parse_field(SubString *str, SubString *field_name, SubString *format_spec,
556
            int *format_spec_needs_expanding, Py_UCS4 *conversion)
557
16.2M
{
558
    /* Note this function works if the field name is zero length,
559
       which is good.  Zero length field names are handled later, in
560
       field_name_split. */
561
562
16.2M
    Py_UCS4 c = 0;
563
564
    /* initialize these, as they may be empty */
565
16.2M
    *conversion = '\0';
566
16.2M
    SubString_init(format_spec, NULL, 0, 0);
567
568
    /* Search for the field name.  it's terminated by the end of
569
       the string, or a ':' or '!' */
570
16.2M
    field_name->str = str->str;
571
16.2M
    field_name->start = str->start;
572
16.5M
    while (str->start < str->end) {
573
16.5M
        switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
574
0
        case '{':
575
0
            PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
576
0
            return 0;
577
0
        case '[':
578
0
            for (; str->start < str->end; str->start++)
579
0
                if (PyUnicode_READ_CHAR(str->str, str->start) == ']')
580
0
                    break;
581
0
            continue;
582
15.6M
        case '}':
583
15.6M
        case ':':
584
16.2M
        case '!':
585
16.2M
            break;
586
337k
        default:
587
337k
            continue;
588
16.5M
        }
589
16.2M
        break;
590
16.5M
    }
591
592
16.2M
    field_name->end = str->start - 1;
593
16.2M
    if (c == '!' || c == ':') {
594
529k
        Py_ssize_t count;
595
        /* we have a format specifier and/or a conversion */
596
        /* don't include the last character */
597
598
        /* see if there's a conversion specifier */
599
529k
        if (c == '!') {
600
            /* there must be another character present */
601
529k
            if (str->start >= str->end) {
602
0
                PyErr_SetString(PyExc_ValueError,
603
0
                                "end of string while looking for conversion "
604
0
                                "specifier");
605
0
                return 0;
606
0
            }
607
529k
            *conversion = PyUnicode_READ_CHAR(str->str, str->start++);
608
609
529k
            if (str->start < str->end) {
610
529k
                c = PyUnicode_READ_CHAR(str->str, str->start++);
611
529k
                if (c == '}')
612
529k
                    return 1;
613
0
                if (c != ':') {
614
0
                    PyErr_SetString(PyExc_ValueError,
615
0
                                    "expected ':' after conversion specifier");
616
0
                    return 0;
617
0
                }
618
0
            }
619
529k
        }
620
192
        format_spec->str = str->str;
621
192
        format_spec->start = str->start;
622
192
        count = 1;
623
768
        while (str->start < str->end) {
624
768
            switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
625
0
            case '{':
626
0
                *format_spec_needs_expanding = 1;
627
0
                count++;
628
0
                break;
629
192
            case '}':
630
192
                count--;
631
192
                if (count == 0) {
632
192
                    format_spec->end = str->start - 1;
633
192
                    return 1;
634
192
                }
635
0
                break;
636
576
            default:
637
576
                break;
638
768
            }
639
768
        }
640
641
0
        PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
642
0
        return 0;
643
192
    }
644
15.6M
    else if (c != '}') {
645
0
        PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
646
0
        return 0;
647
0
    }
648
649
15.6M
    return 1;
650
16.2M
}
651
652
/************************************************************************/
653
/******* Output string allocation and escape-to-markup processing  ******/
654
/************************************************************************/
655
656
/* MarkupIterator breaks the string into pieces of either literal
657
   text, or things inside {} that need to be marked up.  it is
658
   designed to make it easy to wrap a Python iterator around it, for
659
   use with the Formatter class */
660
661
typedef struct {
662
    SubString str;  /* the part of the format string not yet consumed;
                       str.start moves forward as pieces are returned */
663
} MarkupIterator;
664
665
static int
666
MarkupIterator_init(MarkupIterator *self, PyObject *str,
667
                    Py_ssize_t start, Py_ssize_t end)
668
8.60M
{
669
8.60M
    SubString_init(&self->str, str, start, end);
670
8.60M
    return 1;
671
8.60M
}
672
673
/* returns 0 on error, 1 on non-error termination, and 2 if it got a
674
   string (or something to be expanded) */
675
static int
676
MarkupIterator_next(MarkupIterator *self, SubString *literal,
677
                    int *field_present, SubString *field_name,
678
                    SubString *format_spec, Py_UCS4 *conversion,
679
                    int *format_spec_needs_expanding)
680
32.6M
{
681
32.6M
    int at_end;
682
32.6M
    Py_UCS4 c = 0;
683
32.6M
    Py_ssize_t start;
684
32.6M
    Py_ssize_t len;
685
32.6M
    int markup_follows = 0;
686
687
    /* initialize all of the output variables */
688
32.6M
    SubString_init(literal, NULL, 0, 0);
689
32.6M
    SubString_init(field_name, NULL, 0, 0);
690
32.6M
    SubString_init(format_spec, NULL, 0, 0);
691
32.6M
    *conversion = '\0';
692
32.6M
    *format_spec_needs_expanding = 0;
693
32.6M
    *field_present = 0;
694
695
    /* No more input, end of iterator.  This is the normal exit
696
       path. */
697
32.6M
    if (self->str.start >= self->str.end)
698
8.60M
        return 1;
699
700
24.0M
    start = self->str.start;
701
702
    /* First read any literal text. Read until the end of string, an
703
       escaped '{' or '}', or an unescaped '{'.  In order to never
704
       allocate memory and so I can just pass pointers around, if
705
       there's an escaped '{' or '}' then we'll return the literal
706
       including the brace, but no format object.  The next time
707
       through, we'll return the rest of the literal, skipping past
708
       the second consecutive brace. */
709
72.6M
    while (self->str.start < self->str.end) {
710
64.8M
        switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
711
16.2M
        case '{':
712
16.2M
        case '}':
713
16.2M
            markup_follows = 1;
714
16.2M
            break;
715
48.6M
        default:
716
48.6M
            continue;
717
64.8M
        }
718
16.2M
        break;
719
64.8M
    }
720
721
24.0M
    at_end = self->str.start >= self->str.end;
722
24.0M
    len = self->str.start - start;
723
724
24.0M
    if ((c == '}') && (at_end ||
725
0
                       (c != PyUnicode_READ_CHAR(self->str.str,
726
0
                                                 self->str.start)))) {
727
0
        PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
728
0
                        "in format string");
729
0
        return 0;
730
0
    }
731
24.0M
    if (at_end && c == '{') {
732
0
        PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
733
0
                        "in format string");
734
0
        return 0;
735
0
    }
736
24.0M
    if (!at_end) {
737
16.2M
        if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) {
738
            /* escaped } or {, skip it in the input.  there is no
739
               markup object following us, just this literal text */
740
0
            self->str.start++;
741
0
            markup_follows = 0;
742
0
        }
743
16.2M
        else
744
16.2M
            len--;
745
16.2M
    }
746
747
    /* record the literal text */
748
24.0M
    literal->str = self->str.str;
749
24.0M
    literal->start = start;
750
24.0M
    literal->end = start + len;
751
752
24.0M
    if (!markup_follows)
753
7.80M
        return 2;
754
755
    /* this is markup; parse the field */
756
16.2M
    *field_present = 1;
757
16.2M
    if (!parse_field(&self->str, field_name, format_spec,
758
16.2M
                     format_spec_needs_expanding, conversion))
759
0
        return 0;
760
16.2M
    return 2;
761
16.2M
}
762
763
764
/* do the !r or !s conversion on obj */
765
static PyObject *
766
do_conversion(PyObject *obj, Py_UCS4 conversion)
767
529k
{
768
    /* XXX in pre-3.0, do we need to convert this to unicode, since it
769
       might have returned a string? */
770
529k
    switch (conversion) {
771
529k
    case 'r':
772
529k
        return PyObject_Repr(obj);
773
0
    case 's':
774
0
        return PyObject_Str(obj);
775
0
    case 'a':
776
0
        return PyObject_ASCII(obj);
777
0
    default:
778
0
        if (conversion > 32 && conversion < 127) {
779
                /* It's the ASCII subrange; casting to char is safe
780
                   (assuming the execution character set is an ASCII
781
                   superset). */
782
0
                PyErr_Format(PyExc_ValueError,
783
0
                     "Unknown conversion specifier %c",
784
0
                     (char)conversion);
785
0
        } else
786
0
                PyErr_Format(PyExc_ValueError,
787
0
                     "Unknown conversion specifier \\x%x",
788
0
                     (unsigned int)conversion);
789
0
        return NULL;
790
529k
    }
791
529k
}
792
793
/* given:
794
795
   {field_name!conversion:format_spec}
796
797
   compute the result and write it to output.
798
   format_spec_needs_expanding is an optimization.  if it's false,
799
   just output the string directly, otherwise recursively expand the
800
   format_spec string.
801
802
   field_name is allowed to be zero length, in which case we
803
   are doing auto field numbering.
804
*/
805
806
static int
807
output_markup(SubString *field_name, SubString *format_spec,
808
              int format_spec_needs_expanding, Py_UCS4 conversion,
809
              _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs,
810
              int recursion_depth, AutoNumber *auto_number)
811
16.2M
{
812
16.2M
    PyObject *tmp = NULL;
813
16.2M
    PyObject *fieldobj = NULL;
814
16.2M
    SubString expanded_format_spec;
815
16.2M
    SubString *actual_format_spec;
816
16.2M
    int result = 0;
817
818
    /* convert field_name to an object */
819
16.2M
    fieldobj = get_field_object(field_name, args, kwargs, auto_number);
820
16.2M
    if (fieldobj == NULL)
821
0
        goto done;
822
823
16.2M
    if (conversion != '\0') {
824
529k
        tmp = do_conversion(fieldobj, conversion);
825
529k
        if (tmp == NULL)
826
0
            goto done;
827
828
        /* do the assignment, transferring ownership: fieldobj = tmp */
829
529k
        Py_SETREF(fieldobj, tmp);
830
529k
        tmp = NULL;
831
529k
    }
832
833
    /* if needed, recursively compute the format_spec */
834
16.2M
    if (format_spec_needs_expanding) {
835
0
        tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
836
0
                           auto_number);
837
0
        if (tmp == NULL)
838
0
            goto done;
839
840
        /* note that in the case we're expanding the format string,
841
           tmp must be kept around until after the call to
842
           render_field. */
843
0
        SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp));
844
0
        actual_format_spec = &expanded_format_spec;
845
0
    }
846
16.2M
    else
847
16.2M
        actual_format_spec = format_spec;
848
849
16.2M
    if (render_field(fieldobj, actual_format_spec, writer) == 0)
850
1
        goto done;
851
852
16.2M
    result = 1;
853
854
16.2M
done:
855
16.2M
    Py_XDECREF(fieldobj);
856
16.2M
    Py_XDECREF(tmp);
857
858
16.2M
    return result;
859
16.2M
}
860
861
/*
862
    do_markup is the top-level loop for the format() method.  It
863
    searches through the format string for escapes to markup codes, and
864
    calls other functions to move non-markup text to the output,
865
    and to perform the markup to the output.
866
*/
867
static int
868
do_markup(SubString *input, PyObject *args, PyObject *kwargs,
869
          _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number)
870
8.60M
{
871
8.60M
    MarkupIterator iter;
872
8.60M
    int format_spec_needs_expanding;
873
8.60M
    int result;
874
8.60M
    int field_present;
875
8.60M
    SubString literal;
876
8.60M
    SubString field_name;
877
8.60M
    SubString format_spec;
878
8.60M
    Py_UCS4 conversion;
879
880
8.60M
    MarkupIterator_init(&iter, input->str, input->start, input->end);
881
32.6M
    while ((result = MarkupIterator_next(&iter, &literal, &field_present,
882
32.6M
                                         &field_name, &format_spec,
883
32.6M
                                         &conversion,
884
32.6M
                                         &format_spec_needs_expanding)) == 2) {
885
24.0M
        if (literal.end != literal.start) {
886
16.4M
            if (!field_present && iter.str.start == iter.str.end)
887
7.80M
                writer->overallocate = 0;
888
16.4M
            if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
889
16.4M
                                                literal.start, literal.end) < 0)
890
0
                return 0;
891
16.4M
        }
892
893
24.0M
        if (field_present) {
894
16.2M
            if (iter.str.start == iter.str.end)
895
800k
                writer->overallocate = 0;
896
16.2M
            if (!output_markup(&field_name, &format_spec,
897
16.2M
                               format_spec_needs_expanding, conversion, writer,
898
16.2M
                               args, kwargs, recursion_depth, auto_number))
899
1
                return 0;
900
16.2M
        }
901
24.0M
    }
902
8.60M
    return result;
903
8.60M
}
904
905
906
/*
907
    build_string allocates the output string and then
908
    calls do_markup to do the heavy lifting.
909
*/
910
static PyObject *
911
build_string(SubString *input, PyObject *args, PyObject *kwargs,
912
             int recursion_depth, AutoNumber *auto_number)
913
8.60M
{
914
8.60M
    _PyUnicodeWriter writer;
915
916
    /* check the recursion level */
917
8.60M
    if (recursion_depth <= 0) {
918
0
        PyErr_SetString(PyExc_ValueError,
919
0
                        "Max string recursion exceeded");
920
0
        return NULL;
921
0
    }
922
923
8.60M
    _PyUnicodeWriter_Init(&writer);
924
8.60M
    writer.overallocate = 1;
925
8.60M
    writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
926
927
8.60M
    if (!do_markup(input, args, kwargs, &writer, recursion_depth,
928
8.60M
                   auto_number)) {
929
1
        _PyUnicodeWriter_Dealloc(&writer);
930
1
        return NULL;
931
1
    }
932
933
8.60M
    return _PyUnicodeWriter_Finish(&writer);
934
8.60M
}
935
936
/************************************************************************/
937
/*********** main routine ***********************************************/
938
/************************************************************************/
939
940
/* this is the main entry point */
941
static PyObject *
942
do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
943
8.60M
{
944
8.60M
    SubString input;
945
946
    /* PEP 3101 says only 2 levels, so that
947
       "{0:{1}}".format('abc', 's')            # works
948
       "{0:{1:{2}}}".format('abc', 's', '')    # fails
949
    */
950
8.60M
    int recursion_depth = 2;
951
952
8.60M
    AutoNumber auto_number;
953
8.60M
    AutoNumber_Init(&auto_number);
954
8.60M
    SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self));
955
8.60M
    return build_string(&input, args, kwargs, recursion_depth, &auto_number);
956
8.60M
}
957
958
/* Variant of do_string_format that takes a single object `obj`: it is
   passed in the kwargs position and no positional arguments are given. */
static PyObject *
do_string_format_map(PyObject *self, PyObject *obj)
{
    PyObject *no_positional_args = NULL;

    return do_string_format(self, no_positional_args, obj);
}
963
964
965
/************************************************************************/
966
/*********** formatteriterator ******************************************/
967
/************************************************************************/
968
969
/* This is used to implement string.Formatter.vparse().  It exists so
970
   Formatter can share code with the built in unicode.format() method.
971
   It's really just a wrapper around MarkupIterator that is callable
972
   from Python. */
973
974
typedef struct {
975
    PyObject_HEAD
976
    PyObject *str;
977
    MarkupIterator it_markup;
978
} formatteriterobject;
979
980
static void
981
formatteriter_dealloc(PyObject *op)
982
0
{
983
0
    formatteriterobject *it = (formatteriterobject*)op;
984
0
    Py_XDECREF(it->str);
985
0
    PyObject_Free(it);
986
0
}
987
988
/* returns a tuple:
989
   (literal, field_name, format_spec, conversion)
990
991
   literal is any literal text to output.  might be zero length
992
   field_name is the string before the ':'.  might be None
993
   format_spec is the string after the ':'.  mibht be None
994
   conversion is either None, or the string after the '!'
995
*/
996
static PyObject *
997
formatteriter_next(PyObject *op)
998
0
{
999
0
    formatteriterobject *it = (formatteriterobject*)op;
1000
0
    SubString literal;
1001
0
    SubString field_name;
1002
0
    SubString format_spec;
1003
0
    Py_UCS4 conversion;
1004
0
    int format_spec_needs_expanding;
1005
0
    int field_present;
1006
0
    int result = MarkupIterator_next(&it->it_markup, &literal, &field_present,
1007
0
                                     &field_name, &format_spec, &conversion,
1008
0
                                     &format_spec_needs_expanding);
1009
1010
    /* all of the SubString objects point into it->str, so no
1011
       memory management needs to be done on them */
1012
0
    assert(0 <= result && result <= 2);
1013
0
    if (result == 0 || result == 1)
1014
        /* if 0, error has already been set, if 1, iterator is empty */
1015
0
        return NULL;
1016
0
    else {
1017
0
        PyObject *literal_str = NULL;
1018
0
        PyObject *field_name_str = NULL;
1019
0
        PyObject *format_spec_str = NULL;
1020
0
        PyObject *conversion_str = NULL;
1021
0
        PyObject *tuple = NULL;
1022
1023
0
        literal_str = SubString_new_object(&literal);
1024
0
        if (literal_str == NULL)
1025
0
            goto done;
1026
1027
0
        field_name_str = SubString_new_object(&field_name);
1028
0
        if (field_name_str == NULL)
1029
0
            goto done;
1030
1031
        /* if field_name is non-zero length, return a string for
1032
           format_spec (even if zero length), else return None */
1033
0
        format_spec_str = (field_present ?
1034
0
                           SubString_new_object_or_empty :
1035
0
                           SubString_new_object)(&format_spec);
1036
0
        if (format_spec_str == NULL)
1037
0
            goto done;
1038
1039
        /* if the conversion is not specified, return a None,
1040
           otherwise create a one length string with the conversion
1041
           character */
1042
0
        if (conversion == '\0') {
1043
0
            conversion_str = Py_NewRef(Py_None);
1044
0
        }
1045
0
        else
1046
0
            conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
1047
0
                                                       &conversion, 1);
1048
0
        if (conversion_str == NULL)
1049
0
            goto done;
1050
1051
0
        tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str,
1052
0
                             conversion_str);
1053
0
    done:
1054
0
        Py_XDECREF(literal_str);
1055
0
        Py_XDECREF(field_name_str);
1056
0
        Py_XDECREF(format_spec_str);
1057
0
        Py_XDECREF(conversion_str);
1058
0
        return tuple;
1059
0
    }
1060
0
}
1061
1062
static PyMethodDef formatteriter_methods[] = {
1063
    {NULL,              NULL}           /* sentinel */
1064
};
1065
1066
static PyTypeObject PyFormatterIter_Type = {
1067
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1068
    "formatteriterator",                /* tp_name */
1069
    sizeof(formatteriterobject),        /* tp_basicsize */
1070
    0,                                  /* tp_itemsize */
1071
    /* methods */
1072
    formatteriter_dealloc,              /* tp_dealloc */
1073
    0,                                  /* tp_vectorcall_offset */
1074
    0,                                  /* tp_getattr */
1075
    0,                                  /* tp_setattr */
1076
    0,                                  /* tp_as_async */
1077
    0,                                  /* tp_repr */
1078
    0,                                  /* tp_as_number */
1079
    0,                                  /* tp_as_sequence */
1080
    0,                                  /* tp_as_mapping */
1081
    0,                                  /* tp_hash */
1082
    0,                                  /* tp_call */
1083
    0,                                  /* tp_str */
1084
    PyObject_GenericGetAttr,            /* tp_getattro */
1085
    0,                                  /* tp_setattro */
1086
    0,                                  /* tp_as_buffer */
1087
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1088
    0,                                  /* tp_doc */
1089
    0,                                  /* tp_traverse */
1090
    0,                                  /* tp_clear */
1091
    0,                                  /* tp_richcompare */
1092
    0,                                  /* tp_weaklistoffset */
1093
    PyObject_SelfIter,                  /* tp_iter */
1094
    formatteriter_next,                 /* tp_iternext */
1095
    formatteriter_methods,              /* tp_methods */
1096
    0,
1097
};
1098
1099
/* unicode_formatter_parser is used to implement
1100
   string.Formatter.vformat.  it parses a string and returns tuples
1101
   describing the parsed elements.  It's a wrapper around
1102
   stringlib/string_format.h's MarkupIterator */
1103
static PyObject *
1104
formatter_parser(PyObject *Py_UNUSED(module), PyObject *self)
1105
0
{
1106
0
    formatteriterobject *it;
1107
1108
0
    if (!PyUnicode_Check(self)) {
1109
0
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1110
0
        return NULL;
1111
0
    }
1112
1113
0
    it = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
1114
0
    if (it == NULL)
1115
0
        return NULL;
1116
1117
    /* take ownership, give the object to the iterator */
1118
0
    it->str = Py_NewRef(self);
1119
1120
    /* initialize the contained MarkupIterator */
1121
0
    MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self));
1122
0
    return (PyObject *)it;
1123
0
}
1124
1125
1126
/************************************************************************/
1127
/*********** fieldnameiterator ******************************************/
1128
/************************************************************************/
1129
1130
1131
/* This is used to implement string.Formatter.vparse().  It parses the
1132
   field name into attribute and item values.  It's a Python-callable
1133
   wrapper around FieldNameIterator */
1134
1135
typedef struct {
1136
    PyObject_HEAD
1137
    PyObject *str;
1138
    FieldNameIterator it_field;
1139
} fieldnameiterobject;
1140
1141
static void
1142
fieldnameiter_dealloc(PyObject *op)
1143
0
{
1144
0
    fieldnameiterobject *it = (fieldnameiterobject*)op;
1145
0
    Py_XDECREF(it->str);
1146
0
    PyObject_Free(it);
1147
0
}
1148
1149
/* returns a tuple:
1150
   (is_attr, value)
1151
   is_attr is true if we used attribute syntax (e.g., '.foo')
1152
              false if we used index syntax (e.g., '[foo]')
1153
   value is an integer or string
1154
*/
1155
static PyObject *
1156
fieldnameiter_next(PyObject *op)
1157
0
{
1158
0
    fieldnameiterobject *it = (fieldnameiterobject*)op;
1159
0
    int result;
1160
0
    int is_attr;
1161
0
    Py_ssize_t idx;
1162
0
    SubString name;
1163
1164
0
    result = FieldNameIterator_next(&it->it_field, &is_attr,
1165
0
                                    &idx, &name);
1166
0
    if (result == 0 || result == 1)
1167
        /* if 0, error has already been set, if 1, iterator is empty */
1168
0
        return NULL;
1169
0
    else {
1170
0
        PyObject* result = NULL;
1171
0
        PyObject* is_attr_obj = NULL;
1172
0
        PyObject* obj = NULL;
1173
1174
0
        is_attr_obj = PyBool_FromLong(is_attr);
1175
0
        if (is_attr_obj == NULL)
1176
0
            goto error;
1177
1178
        /* either an integer or a string */
1179
0
        if (idx != -1)
1180
0
            obj = PyLong_FromSsize_t(idx);
1181
0
        else
1182
0
            obj = SubString_new_object(&name);
1183
0
        if (obj == NULL)
1184
0
            goto error;
1185
1186
        /* return a tuple of values */
1187
0
        return _PyTuple_FromPairSteal(is_attr_obj, obj);
1188
1189
0
    error:
1190
0
        Py_XDECREF(is_attr_obj);
1191
0
        Py_XDECREF(obj);
1192
0
        return result;
1193
0
    }
1194
0
}
1195
1196
static PyMethodDef fieldnameiter_methods[] = {
1197
    {NULL,              NULL}           /* sentinel */
1198
};
1199
1200
static PyTypeObject PyFieldNameIter_Type = {
1201
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1202
    "fieldnameiterator",                /* tp_name */
1203
    sizeof(fieldnameiterobject),        /* tp_basicsize */
1204
    0,                                  /* tp_itemsize */
1205
    /* methods */
1206
    fieldnameiter_dealloc,              /* tp_dealloc */
1207
    0,                                  /* tp_vectorcall_offset */
1208
    0,                                  /* tp_getattr */
1209
    0,                                  /* tp_setattr */
1210
    0,                                  /* tp_as_async */
1211
    0,                                  /* tp_repr */
1212
    0,                                  /* tp_as_number */
1213
    0,                                  /* tp_as_sequence */
1214
    0,                                  /* tp_as_mapping */
1215
    0,                                  /* tp_hash */
1216
    0,                                  /* tp_call */
1217
    0,                                  /* tp_str */
1218
    PyObject_GenericGetAttr,            /* tp_getattro */
1219
    0,                                  /* tp_setattro */
1220
    0,                                  /* tp_as_buffer */
1221
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1222
    0,                                  /* tp_doc */
1223
    0,                                  /* tp_traverse */
1224
    0,                                  /* tp_clear */
1225
    0,                                  /* tp_richcompare */
1226
    0,                                  /* tp_weaklistoffset */
1227
    PyObject_SelfIter,                  /* tp_iter */
1228
    fieldnameiter_next,                 /* tp_iternext */
1229
    fieldnameiter_methods,              /* tp_methods */
1230
    0};
1231
1232
/* unicode_formatter_field_name_split is used to implement
1233
   string.Formatter.vformat.  it takes a PEP 3101 "field name", and
1234
   returns a tuple of (first, rest): "first", the part before the
1235
   first '.' or '['; and "rest", an iterator for the rest of the field
1236
   name.  it's a wrapper around stringlib/string_format.h's
1237
   field_name_split.  The iterator it returns is a
1238
   FieldNameIterator */
1239
static PyObject *
1240
formatter_field_name_split(PyObject *Py_UNUSED(module), PyObject *self)
1241
0
{
1242
0
    SubString first;
1243
0
    Py_ssize_t first_idx;
1244
0
    fieldnameiterobject *it;
1245
1246
0
    PyObject *first_obj = NULL;
1247
0
    PyObject *result = NULL;
1248
1249
0
    if (!PyUnicode_Check(self)) {
1250
0
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1251
0
        return NULL;
1252
0
    }
1253
1254
0
    it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
1255
0
    if (it == NULL)
1256
0
        return NULL;
1257
1258
    /* take ownership, give the object to the iterator.  this is
1259
       just to keep the field_name alive */
1260
0
    it->str = Py_NewRef(self);
1261
1262
    /* Pass in auto_number = NULL. We'll return an empty string for
1263
       first_obj in that case. */
1264
0
    if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self),
1265
0
                          &first, &first_idx, &it->it_field, NULL))
1266
0
        goto error;
1267
1268
    /* first becomes an integer, if possible; else a string */
1269
0
    if (first_idx != -1)
1270
0
        first_obj = PyLong_FromSsize_t(first_idx);
1271
0
    else
1272
        /* convert "first" into a string object */
1273
0
        first_obj = SubString_new_object(&first);
1274
0
    if (first_obj == NULL)
1275
0
        goto error;
1276
1277
    /* return a tuple of values */
1278
0
    return _PyTuple_FromPairSteal(first_obj, (PyObject *)it);
1279
1280
0
error:
1281
0
    Py_XDECREF(it);
1282
0
    Py_XDECREF(first_obj);
1283
0
    return result;
1284
0
}