Coverage Report

Created: 2026-05-16 06:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/stringlib/unicode_format.h
Line
Count
Source
1
/*
2
    unicode_format.h -- implementation of str.format().
3
*/
4
5
#include "pycore_complexobject.h" // _PyComplex_FormatAdvancedWriter()
6
#include "pycore_floatobject.h"   // _PyFloat_FormatAdvancedWriter()
7
#include "pycore_tuple.h"         // _PyTuple_FromPairSteal
8
9
/************************************************************************/
10
/***********   Global data structures and forward declarations  *********/
11
/************************************************************************/
12
13
/*
14
   A SubString consists of the characters of the string object str
15
   between the indices start (inclusive) and end (exclusive).
16
*/
17
typedef struct {
18
    PyObject *str; /* borrowed reference */
19
    Py_ssize_t start, end;
20
} SubString;
21
22
23
typedef enum {
24
    ANS_INIT,
25
    ANS_AUTO,
26
    ANS_MANUAL
27
} AutoNumberState;   /* Keep track if we're auto-numbering fields */
28
29
/* Keeps track of our auto-numbering state, and which number field we're on */
30
typedef struct {
31
    AutoNumberState an_state;
32
    int an_field_number;
33
} AutoNumber;
34
35
36
/* forward declaration for recursion */
37
static PyObject *
38
build_string(SubString *input, PyObject *args, PyObject *kwargs,
39
             int recursion_depth, AutoNumber *auto_number);
40
41
42
43
/************************************************************************/
44
/**************************  Utility  functions  ************************/
45
/************************************************************************/
46
47
static void
48
AutoNumber_Init(AutoNumber *auto_number)
49
8.34M
{
50
8.34M
    auto_number->an_state = ANS_INIT;
51
8.34M
    auto_number->an_field_number = 0;
52
8.34M
}
53
54
/* fill in a SubString from a string object and its start/end indices */
55
Py_LOCAL_INLINE(void)
56
SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end)
57
158M
{
58
158M
    str->str = s;
59
158M
    str->start = start;
60
158M
    str->end = end;
61
158M
}
62
63
/* return a new string.  if str->str is NULL, return None */
64
Py_LOCAL_INLINE(PyObject *)
65
SubString_new_object(SubString *str)
66
45.8k
{
67
45.8k
    if (str->str == NULL)
68
0
        Py_RETURN_NONE;
69
45.8k
    return PyUnicode_Substring(str->str, str->start, str->end);
70
45.8k
}
71
72
/* return a new string.  if str->str is NULL, return a new empty string */
73
Py_LOCAL_INLINE(PyObject *)
74
SubString_new_object_or_empty(SubString *str)
75
0
{
76
0
    if (str->str == NULL) {
77
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
78
0
    }
79
0
    return SubString_new_object(str);
80
0
}
81
82
/* Return 1 if an error has been detected switching between automatic
83
   field numbering and manual field specification, else return 0. Set
84
   ValueError on error. */
85
static int
86
autonumber_state_error(AutoNumberState state, int field_name_is_empty)
87
15.6M
{
88
15.6M
    if (state == ANS_MANUAL) {
89
432
        if (field_name_is_empty) {
90
0
            PyErr_SetString(PyExc_ValueError, "cannot switch from "
91
0
                            "manual field specification to "
92
0
                            "automatic field numbering");
93
0
            return 1;
94
0
        }
95
432
    }
96
15.6M
    else {
97
15.6M
        if (!field_name_is_empty) {
98
0
            PyErr_SetString(PyExc_ValueError, "cannot switch from "
99
0
                            "automatic field numbering to "
100
0
                            "manual field specification");
101
0
            return 1;
102
0
        }
103
15.6M
    }
104
15.6M
    return 0;
105
15.6M
}
106
107
108
/************************************************************************/
109
/***********  Format string parsing -- integers and identifiers *********/
110
/************************************************************************/
111
112
static Py_ssize_t
113
get_integer(const SubString *str)
114
15.7M
{
115
15.7M
    Py_ssize_t accumulator = 0;
116
15.7M
    Py_ssize_t digitval;
117
15.7M
    Py_ssize_t i;
118
119
    /* empty string is an error */
120
15.7M
    if (str->start >= str->end)
121
15.6M
        return -1;
122
123
46.6k
    for (i = str->start; i < str->end; i++) {
124
46.2k
        digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i));
125
46.2k
        if (digitval < 0)
126
45.8k
            return -1;
127
        /*
128
           Detect possible overflow before it happens:
129
130
              accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if
131
              accumulator > (PY_SSIZE_T_MAX - digitval) / 10.
132
        */
133
432
        if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
134
0
            PyErr_Format(PyExc_ValueError,
135
0
                         "Too many decimal digits in format string");
136
0
            return -1;
137
0
        }
138
432
        accumulator = accumulator * 10 + digitval;
139
432
    }
140
432
    return accumulator;
141
46.2k
}
142
143
/************************************************************************/
144
/******** Functions to get field objects and specification strings ******/
145
/************************************************************************/
146
147
/* do the equivalent of obj.name */
148
static PyObject *
149
getattr(PyObject *obj, SubString *name)
150
4
{
151
4
    PyObject *newobj;
152
4
    PyObject *str = SubString_new_object(name);
153
4
    if (str == NULL)
154
0
        return NULL;
155
4
    newobj = PyObject_GetAttr(obj, str);
156
4
    Py_DECREF(str);
157
4
    return newobj;
158
4
}
159
160
/* do the equivalent of obj[idx], where obj is a sequence */
161
static PyObject *
162
getitem_sequence(PyObject *obj, Py_ssize_t idx)
163
0
{
164
0
    return PySequence_GetItem(obj, idx);
165
0
}
166
167
/* do the equivalent of obj[idx], where obj is not a sequence */
168
static PyObject *
169
getitem_idx(PyObject *obj, Py_ssize_t idx)
170
0
{
171
0
    PyObject *newobj;
172
0
    PyObject *idx_obj = PyLong_FromSsize_t(idx);
173
0
    if (idx_obj == NULL)
174
0
        return NULL;
175
0
    newobj = PyObject_GetItem(obj, idx_obj);
176
0
    Py_DECREF(idx_obj);
177
0
    return newobj;
178
0
}
179
180
/* do the equivalent of obj[name] */
181
static PyObject *
182
getitem_str(PyObject *obj, SubString *name)
183
0
{
184
0
    PyObject *newobj;
185
0
    PyObject *str = SubString_new_object(name);
186
0
    if (str == NULL)
187
0
        return NULL;
188
0
    newobj = PyObject_GetItem(obj, str);
189
0
    Py_DECREF(str);
190
0
    return newobj;
191
0
}
192
193
typedef struct {
194
    /* the entire string we're parsing.  we assume that someone else
195
       is managing its lifetime, and that it will exist for the
196
       lifetime of the iterator.  can be empty */
197
    SubString str;
198
199
    /* index to where we are inside field_name */
200
    Py_ssize_t index;
201
} FieldNameIterator;
202
203
204
static int
205
FieldNameIterator_init(FieldNameIterator *self, PyObject *s,
206
                       Py_ssize_t start, Py_ssize_t end)
207
15.7M
{
208
15.7M
    SubString_init(&self->str, s, start, end);
209
15.7M
    self->index = start;
210
15.7M
    return 1;
211
15.7M
}
212
213
static int
214
_FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
215
4
{
216
4
    Py_UCS4 c;
217
218
4
    name->str = self->str.str;
219
4
    name->start = self->index;
220
221
    /* return everything until '.' or '[' */
222
52
    while (self->index < self->str.end) {
223
48
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
224
48
        switch (c) {
225
0
        case '[':
226
0
        case '.':
227
            /* back up so that this character will be seen next time */
228
0
            self->index--;
229
0
            break;
230
48
        default:
231
48
            continue;
232
48
        }
233
0
        break;
234
48
    }
235
    /* end of string is okay */
236
4
    name->end = self->index;
237
4
    return 1;
238
4
}
239
240
static int
241
_FieldNameIterator_item(FieldNameIterator *self, SubString *name)
242
0
{
243
0
    int bracket_seen = 0;
244
0
    Py_UCS4 c;
245
246
0
    name->str = self->str.str;
247
0
    name->start = self->index;
248
249
    /* return everything until ']' */
250
0
    while (self->index < self->str.end) {
251
0
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
252
0
        switch (c) {
253
0
        case ']':
254
0
            bracket_seen = 1;
255
0
            break;
256
0
        default:
257
0
            continue;
258
0
        }
259
0
        break;
260
0
    }
261
    /* make sure we ended with a ']' */
262
0
    if (!bracket_seen) {
263
0
        PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
264
0
        return 0;
265
0
    }
266
267
    /* a ']' was found; self->index is now one past it */
268
    /* don't include the ']' */
269
0
    name->end = self->index-1;
270
0
    return 1;
271
0
}
272
273
/* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */
274
static int
275
FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
276
                       Py_ssize_t *name_idx, SubString *name)
277
15.7M
{
278
    /* check at end of input */
279
15.7M
    if (self->index >= self->str.end)
280
15.7M
        return 1;
281
282
4
    switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) {
283
4
    case '.':
284
4
        *is_attribute = 1;
285
4
        if (_FieldNameIterator_attr(self, name) == 0)
286
0
            return 0;
287
4
        *name_idx = -1;
288
4
        break;
289
0
    case '[':
290
0
        *is_attribute = 0;
291
0
        if (_FieldNameIterator_item(self, name) == 0)
292
0
            return 0;
293
0
        *name_idx = get_integer(name);
294
0
        if (*name_idx == -1 && PyErr_Occurred())
295
0
            return 0;
296
0
        break;
297
0
    default:
298
        /* Invalid character follows ']' */
299
0
        PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
300
0
                        "follow ']' in format field specifier");
301
0
        return 0;
302
4
    }
303
304
    /* empty string is an error */
305
4
    if (name->start == name->end) {
306
0
        PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
307
0
        return 0;
308
0
    }
309
310
4
    return 2;
311
4
}
312
313
314
/* input: field_name
315
   output: 'first' points to the part before the first '[' or '.'
316
           'first_idx' is -1 if 'first' is not an integer, otherwise
317
                       it's the value of first converted to an integer
318
           'rest' is an iterator to return the rest
319
*/
320
static int
321
field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first,
322
                 Py_ssize_t *first_idx, FieldNameIterator *rest,
323
                 AutoNumber *auto_number)
324
15.7M
{
325
15.7M
    Py_UCS4 c;
326
15.7M
    Py_ssize_t i = start;
327
15.7M
    int field_name_is_empty;
328
15.7M
    int using_numeric_index;
329
330
    /* find the part up until the first '.' or '[' */
331
16.0M
    while (i < end) {
332
332k
        switch (c = PyUnicode_READ_CHAR(str, i++)) {
333
0
        case '[':
334
4
        case '.':
335
            /* back up so that this character is available to the
336
               "rest" iterator */
337
4
            i--;
338
4
            break;
339
332k
        default:
340
332k
            continue;
341
332k
        }
342
4
        break;
343
332k
    }
344
345
    /* set up the return values */
346
15.7M
    SubString_init(first, str, start, i);
347
15.7M
    FieldNameIterator_init(rest, str, i, end);
348
349
    /* see if "first" is an integer, in which case it's used as an index */
350
15.7M
    *first_idx = get_integer(first);
351
15.7M
    if (*first_idx == -1 && PyErr_Occurred())
352
0
        return 0;
353
354
15.7M
    field_name_is_empty = first->start >= first->end;
355
356
    /* If the field name is omitted or if we have a numeric index
357
       specified, then we're doing numeric indexing into args. */
358
15.7M
    using_numeric_index = field_name_is_empty || *first_idx != -1;
359
360
    /* We always get here exactly one time for each field we're
361
       processing. And we get here in field order (counting by left
362
       braces). So this is the perfect place to handle automatic field
363
       numbering if the field name is omitted. */
364
365
    /* Check if we need to do the auto-numbering. It's not needed if
366
       we're called from the string.Formatter routines, because it's handled
367
       in that class by itself. */
368
15.7M
    if (auto_number) {
369
        /* Initialize our auto numbering state if this is the first
370
           time we're either auto-numbering or manually numbering. */
371
15.7M
        if (auto_number->an_state == ANS_INIT && using_numeric_index)
372
8.33M
            auto_number->an_state = field_name_is_empty ?
373
8.33M
                ANS_AUTO : ANS_MANUAL;
374
375
        /* Make sure our state is consistent with what we're doing
376
           this time through. Only check if we're using a numeric
377
           index. */
378
15.7M
        if (using_numeric_index)
379
15.6M
            if (autonumber_state_error(auto_number->an_state,
380
15.6M
                                       field_name_is_empty))
381
0
                return 0;
382
        /* Zero length field means we want to do auto-numbering of the
383
           fields. */
384
15.7M
        if (field_name_is_empty)
385
15.6M
            *first_idx = (auto_number->an_field_number)++;
386
15.7M
    }
387
388
15.7M
    return 1;
389
15.7M
}
390
391
392
/*
393
    get_field_object returns the object inside {}, before the
394
    format_spec.  It handles getindex and getattr lookups and consumes
395
    the entire input string.
396
*/
397
static PyObject *
398
get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
399
                 AutoNumber *auto_number)
400
15.7M
{
401
15.7M
    PyObject *obj = NULL;
402
15.7M
    int ok;
403
15.7M
    int is_attribute;
404
15.7M
    SubString name;
405
15.7M
    SubString first;
406
15.7M
    Py_ssize_t index;
407
15.7M
    FieldNameIterator rest;
408
409
15.7M
    if (!field_name_split(input->str, input->start, input->end, &first,
410
15.7M
                          &index, &rest, auto_number)) {
411
0
        goto error;
412
0
    }
413
414
15.7M
    if (index == -1) {
415
        /* look up in kwargs */
416
45.8k
        PyObject *key = SubString_new_object(&first);
417
45.8k
        if (key == NULL) {
418
0
            goto error;
419
0
        }
420
45.8k
        if (kwargs == NULL) {
421
0
            PyErr_SetObject(PyExc_KeyError, key);
422
0
            Py_DECREF(key);
423
0
            goto error;
424
0
        }
425
        /* Use PyObject_GetItem instead of PyDict_GetItem because this
426
           code is no longer just used with kwargs. It might be passed
427
           a non-dict when called through format_map. */
428
45.8k
        obj = PyObject_GetItem(kwargs, key);
429
45.8k
        Py_DECREF(key);
430
45.8k
        if (obj == NULL) {
431
0
            goto error;
432
0
        }
433
45.8k
    }
434
15.6M
    else {
435
        /* If args is NULL, we have a format string with a positional field
436
           with only kwargs to retrieve it from. This can only happen when
437
           used with format_map(), where positional arguments are not
438
           allowed. */
439
15.6M
        if (args == NULL) {
440
0
            PyErr_SetString(PyExc_ValueError, "Format string contains "
441
0
                            "positional fields");
442
0
            goto error;
443
0
        }
444
445
        /* look up in args */
446
15.6M
        obj = PySequence_GetItem(args, index);
447
15.6M
        if (obj == NULL) {
448
0
            PyErr_Format(PyExc_IndexError,
449
0
                         "Replacement index %zd out of range for positional "
450
0
                         "args tuple",
451
0
                         index);
452
0
             goto error;
453
0
        }
454
15.6M
    }
455
456
    /* iterate over the rest of the field_name */
457
15.7M
    while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
458
15.7M
                                        &name)) == 2) {
459
4
        PyObject *tmp;
460
461
4
        if (is_attribute)
462
            /* getattr lookup "." */
463
4
            tmp = getattr(obj, &name);
464
0
        else
465
            /* getitem lookup "[]" */
466
0
            if (index == -1)
467
0
                tmp = getitem_str(obj, &name);
468
0
            else
469
0
                if (PySequence_Check(obj))
470
0
                    tmp = getitem_sequence(obj, index);
471
0
                else
472
                    /* not a sequence */
473
0
                    tmp = getitem_idx(obj, index);
474
4
        if (tmp == NULL)
475
0
            goto error;
476
477
        /* assign to obj */
478
4
        Py_SETREF(obj, tmp);
479
4
    }
480
    /* end of iterator, this is the non-error case */
481
15.7M
    if (ok == 1)
482
15.7M
        return obj;
483
0
error:
484
0
    Py_XDECREF(obj);
485
0
    return NULL;
486
15.7M
}
487
488
/************************************************************************/
489
/*****************  Field rendering functions  **************************/
490
/************************************************************************/
491
492
/*
493
    render_field() is the main function in this section.  It takes the
494
    field object and field specification string generated by
495
    get_field_and_spec, and renders the field into the output string.
496
497
    render_field calls fieldobj.__format__(format_spec) method, and
498
    appends to the output.
499
*/
500
static int
501
render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer)
502
15.7M
{
503
15.7M
    int ok = 0;
504
15.7M
    PyObject *result = NULL;
505
15.7M
    PyObject *format_spec_object = NULL;
506
15.7M
    int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
507
15.7M
    int err;
508
509
    /* If we know the type exactly, skip the lookup of __format__ and just
510
       call the formatter directly. */
511
15.7M
    if (PyUnicode_CheckExact(fieldobj))
512
15.5M
        formatter = _PyUnicode_FormatAdvancedWriter;
513
190k
    else if (PyLong_CheckExact(fieldobj))
514
61.4k
        formatter = _PyLong_FormatAdvancedWriter;
515
128k
    else if (PyFloat_CheckExact(fieldobj))
516
0
        formatter = _PyFloat_FormatAdvancedWriter;
517
128k
    else if (PyComplex_CheckExact(fieldobj))
518
0
        formatter = _PyComplex_FormatAdvancedWriter;
519
520
15.7M
    if (formatter) {
521
        /* we know exactly which formatter will be called when __format__ is
522
           looked up, so call it directly, instead. */
523
15.5M
        err = formatter(writer, fieldobj, format_spec->str,
524
15.5M
                        format_spec->start, format_spec->end);
525
15.5M
        return (err == 0);
526
15.5M
    }
527
128k
    else {
528
        /* We need to create an object out of the pointers we have, because
529
           __format__ takes a string/unicode object for format_spec. */
530
128k
        if (format_spec->str)
531
0
            format_spec_object = PyUnicode_Substring(format_spec->str,
532
0
                                                     format_spec->start,
533
0
                                                     format_spec->end);
534
128k
        else
535
128k
            format_spec_object = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
536
128k
        if (format_spec_object == NULL)
537
0
            goto done;
538
539
128k
        result = PyObject_Format(fieldobj, format_spec_object);
540
128k
    }
541
128k
    if (result == NULL)
542
1
        goto done;
543
544
128k
    if (_PyUnicodeWriter_WriteStr(writer, result) == -1)
545
0
        goto done;
546
128k
    ok = 1;
547
548
128k
done:
549
128k
    Py_XDECREF(format_spec_object);
550
128k
    Py_XDECREF(result);
551
128k
    return ok;
552
128k
}
553
554
static int
555
parse_field(SubString *str, SubString *field_name, SubString *format_spec,
556
            int *format_spec_needs_expanding, Py_UCS4 *conversion)
557
15.7M
{
558
    /* Note this function works if the field name is zero length,
559
       which is good.  Zero length field names are handled later, in
560
       field_name_split. */
561
562
15.7M
    Py_UCS4 c = 0;
563
564
    /* initialize these, as they may be empty */
565
15.7M
    *conversion = '\0';
566
15.7M
    SubString_init(format_spec, NULL, 0, 0);
567
568
    /* Search for the field name.  it's terminated by the end of
569
       the string, or a ':' or '!' */
570
15.7M
    field_name->str = str->str;
571
15.7M
    field_name->start = str->start;
572
16.0M
    while (str->start < str->end) {
573
16.0M
        switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
574
0
        case '{':
575
0
            PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
576
0
            return 0;
577
0
        case '[':
578
0
            for (; str->start < str->end; str->start++)
579
0
                if (PyUnicode_READ_CHAR(str->str, str->start) == ']')
580
0
                    break;
581
0
            continue;
582
15.2M
        case '}':
583
15.2M
        case ':':
584
15.7M
        case '!':
585
15.7M
            break;
586
332k
        default:
587
332k
            continue;
588
16.0M
        }
589
15.7M
        break;
590
16.0M
    }
591
592
15.7M
    field_name->end = str->start - 1;
593
15.7M
    if (c == '!' || c == ':') {
594
508k
        Py_ssize_t count;
595
        /* we have a format specifier and/or a conversion */
596
        /* don't include the last character */
597
598
        /* see if there's a conversion specifier */
599
508k
        if (c == '!') {
600
            /* there must be another character present */
601
508k
            if (str->start >= str->end) {
602
0
                PyErr_SetString(PyExc_ValueError,
603
0
                                "end of string while looking for conversion "
604
0
                                "specifier");
605
0
                return 0;
606
0
            }
607
508k
            *conversion = PyUnicode_READ_CHAR(str->str, str->start++);
608
609
508k
            if (str->start < str->end) {
610
508k
                c = PyUnicode_READ_CHAR(str->str, str->start++);
611
508k
                if (c == '}')
612
508k
                    return 1;
613
0
                if (c != ':') {
614
0
                    PyErr_SetString(PyExc_ValueError,
615
0
                                    "expected ':' after conversion specifier");
616
0
                    return 0;
617
0
                }
618
0
            }
619
508k
        }
620
192
        format_spec->str = str->str;
621
192
        format_spec->start = str->start;
622
192
        count = 1;
623
768
        while (str->start < str->end) {
624
768
            switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
625
0
            case '{':
626
0
                *format_spec_needs_expanding = 1;
627
0
                count++;
628
0
                break;
629
192
            case '}':
630
192
                count--;
631
192
                if (count == 0) {
632
192
                    format_spec->end = str->start - 1;
633
192
                    return 1;
634
192
                }
635
0
                break;
636
576
            default:
637
576
                break;
638
768
            }
639
768
        }
640
641
0
        PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
642
0
        return 0;
643
192
    }
644
15.2M
    else if (c != '}') {
645
0
        PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
646
0
        return 0;
647
0
    }
648
649
15.2M
    return 1;
650
15.7M
}
651
652
/************************************************************************/
653
/******* Output string allocation and escape-to-markup processing  ******/
654
/************************************************************************/
655
656
/* MarkupIterator breaks the string into pieces of either literal
657
   text, or things inside {} that need to be marked up.  it is
658
   designed to make it easy to wrap a Python iterator around it, for
659
   use with the Formatter class */
660
661
typedef struct {
662
    SubString str;
663
} MarkupIterator;
664
665
static int
666
MarkupIterator_init(MarkupIterator *self, PyObject *str,
667
                    Py_ssize_t start, Py_ssize_t end)
668
8.34M
{
669
8.34M
    SubString_init(&self->str, str, start, end);
670
8.34M
    return 1;
671
8.34M
}
672
673
/* returns 0 on error, 1 on non-error termination, and 2 if it got a
674
   string (or something to be expanded) */
675
static int
676
MarkupIterator_next(MarkupIterator *self, SubString *literal,
677
                    int *field_present, SubString *field_name,
678
                    SubString *format_spec, Py_UCS4 *conversion,
679
                    int *format_spec_needs_expanding)
680
31.6M
{
681
31.6M
    int at_end;
682
31.6M
    Py_UCS4 c = 0;
683
31.6M
    Py_ssize_t start;
684
31.6M
    Py_ssize_t len;
685
31.6M
    int markup_follows = 0;
686
687
    /* initialize all of the output variables */
688
31.6M
    SubString_init(literal, NULL, 0, 0);
689
31.6M
    SubString_init(field_name, NULL, 0, 0);
690
31.6M
    SubString_init(format_spec, NULL, 0, 0);
691
31.6M
    *conversion = '\0';
692
31.6M
    *format_spec_needs_expanding = 0;
693
31.6M
    *field_present = 0;
694
695
    /* No more input, end of iterator.  This is the normal exit
696
       path. */
697
31.6M
    if (self->str.start >= self->str.end)
698
8.34M
        return 1;
699
700
23.3M
    start = self->str.start;
701
702
    /* First read any literal text. Read until the end of string, an
703
       escaped '{' or '}', or an unescaped '{'.  In order to never
704
       allocate memory and so I can just pass pointers around, if
705
       there's an escaped '{' or '}' then we'll return the literal
706
       including the brace, but no format object.  The next time
707
       through, we'll return the rest of the literal, skipping past
708
       the second consecutive brace. */
709
69.7M
    while (self->str.start < self->str.end) {
710
62.1M
        switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
711
15.7M
        case '{':
712
15.7M
        case '}':
713
15.7M
            markup_follows = 1;
714
15.7M
            break;
715
46.3M
        default:
716
46.3M
            continue;
717
62.1M
        }
718
15.7M
        break;
719
62.1M
    }
720
721
23.3M
    at_end = self->str.start >= self->str.end;
722
23.3M
    len = self->str.start - start;
723
724
23.3M
    if ((c == '}') && (at_end ||
725
0
                       (c != PyUnicode_READ_CHAR(self->str.str,
726
0
                                                 self->str.start)))) {
727
0
        PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
728
0
                        "in format string");
729
0
        return 0;
730
0
    }
731
23.3M
    if (at_end && c == '{') {
732
0
        PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
733
0
                        "in format string");
734
0
        return 0;
735
0
    }
736
23.3M
    if (!at_end) {
737
15.7M
        if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) {
738
            /* escaped } or {, skip it in the input.  there is no
739
               markup object following us, just this literal text */
740
0
            self->str.start++;
741
0
            markup_follows = 0;
742
0
        }
743
15.7M
        else
744
15.7M
            len--;
745
15.7M
    }
746
747
    /* record the literal text */
748
23.3M
    literal->str = self->str.str;
749
23.3M
    literal->start = start;
750
23.3M
    literal->end = start + len;
751
752
23.3M
    if (!markup_follows)
753
7.58M
        return 2;
754
755
    /* this is markup; parse the field */
756
15.7M
    *field_present = 1;
757
15.7M
    if (!parse_field(&self->str, field_name, format_spec,
758
15.7M
                     format_spec_needs_expanding, conversion))
759
0
        return 0;
760
15.7M
    return 2;
761
15.7M
}
762
763
764
/* do the !r, !s or !a conversion on obj */
765
static PyObject *
766
do_conversion(PyObject *obj, Py_UCS4 conversion)
767
508k
{
768
    /* XXX in pre-3.0, do we need to convert this to unicode, since it
769
       might have returned a string? */
770
508k
    switch (conversion) {
771
508k
    case 'r':
772
508k
        return PyObject_Repr(obj);
773
0
    case 's':
774
0
        return PyObject_Str(obj);
775
0
    case 'a':
776
0
        return PyObject_ASCII(obj);
777
0
    default:
778
0
        if (conversion > 32 && conversion < 127) {
779
                /* It's the ASCII subrange; casting to char is safe
780
                   (assuming the execution character set is an ASCII
781
                   superset). */
782
0
                PyErr_Format(PyExc_ValueError,
783
0
                     "Unknown conversion specifier %c",
784
0
                     (char)conversion);
785
0
        } else
786
0
                PyErr_Format(PyExc_ValueError,
787
0
                     "Unknown conversion specifier \\x%x",
788
0
                     (unsigned int)conversion);
789
0
        return NULL;
790
508k
    }
791
508k
}
792
793
/* given:
794
795
   {field_name!conversion:format_spec}
796
797
   compute the result and write it to output.
798
   format_spec_needs_expanding is an optimization.  if it's false,
799
   just output the string directly, otherwise recursively expand the
800
   format_spec string.
801
802
   field_name is allowed to be zero length, in which case we
803
   are doing auto field numbering.
804
*/
805
806
static int
807
output_markup(SubString *field_name, SubString *format_spec,
808
              int format_spec_needs_expanding, Py_UCS4 conversion,
809
              _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs,
810
              int recursion_depth, AutoNumber *auto_number)
811
15.7M
{
812
15.7M
    PyObject *tmp = NULL;
813
15.7M
    PyObject *fieldobj = NULL;
814
15.7M
    SubString expanded_format_spec;
815
15.7M
    SubString *actual_format_spec;
816
15.7M
    int result = 0;
817
818
    /* convert field_name to an object */
819
15.7M
    fieldobj = get_field_object(field_name, args, kwargs, auto_number);
820
15.7M
    if (fieldobj == NULL)
821
0
        goto done;
822
823
15.7M
    if (conversion != '\0') {
824
508k
        tmp = do_conversion(fieldobj, conversion);
825
508k
        if (tmp == NULL)
826
0
            goto done;
827
828
        /* do the assignment, transferring ownership: fieldobj = tmp */
829
508k
        Py_SETREF(fieldobj, tmp);
830
508k
        tmp = NULL;
831
508k
    }
832
833
    /* if needed, recursively compute the format_spec */
834
15.7M
    if (format_spec_needs_expanding) {
835
0
        tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
836
0
                           auto_number);
837
0
        if (tmp == NULL)
838
0
            goto done;
839
840
        /* note that in the case we're expanding the format string,
841
           tmp must be kept around until after the call to
842
           render_field. */
843
0
        SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp));
844
0
        actual_format_spec = &expanded_format_spec;
845
0
    }
846
15.7M
    else
847
15.7M
        actual_format_spec = format_spec;
848
849
15.7M
    if (render_field(fieldobj, actual_format_spec, writer) == 0)
850
1
        goto done;
851
852
15.7M
    result = 1;
853
854
15.7M
done:
855
15.7M
    Py_XDECREF(fieldobj);
856
15.7M
    Py_XDECREF(tmp);
857
858
15.7M
    return result;
859
15.7M
}
860
861
/*
862
    do_markup is the top-level loop for the format() method.  It
863
    searches through the format string for escapes to markup codes, and
864
    calls other functions to move non-markup text to the output,
865
    and to perform the markup to the output.
866
*/
867
static int
868
do_markup(SubString *input, PyObject *args, PyObject *kwargs,
869
          _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number)
870
8.34M
{
871
8.34M
    MarkupIterator iter;
872
8.34M
    int format_spec_needs_expanding;
873
8.34M
    int result;
874
8.34M
    int field_present;
875
8.34M
    SubString literal;
876
8.34M
    SubString field_name;
877
8.34M
    SubString format_spec;
878
8.34M
    Py_UCS4 conversion;
879
880
8.34M
    MarkupIterator_init(&iter, input->str, input->start, input->end);
881
31.6M
    while ((result = MarkupIterator_next(&iter, &literal, &field_present,
882
31.6M
                                         &field_name, &format_spec,
883
31.6M
                                         &conversion,
884
31.6M
                                         &format_spec_needs_expanding)) == 2) {
885
23.3M
        if (literal.end != literal.start) {
886
15.9M
            if (!field_present && iter.str.start == iter.str.end)
887
7.58M
                writer->overallocate = 0;
888
15.9M
            if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
889
15.9M
                                                literal.start, literal.end) < 0)
890
0
                return 0;
891
15.9M
        }
892
893
23.3M
        if (field_present) {
894
15.7M
            if (iter.str.start == iter.str.end)
895
752k
                writer->overallocate = 0;
896
15.7M
            if (!output_markup(&field_name, &format_spec,
897
15.7M
                               format_spec_needs_expanding, conversion, writer,
898
15.7M
                               args, kwargs, recursion_depth, auto_number))
899
1
                return 0;
900
15.7M
        }
901
23.3M
    }
902
8.34M
    return result;
903
8.34M
}
904
905
906
/*
907
    build_string allocates the output string and then
908
    calls do_markup to do the heavy lifting.
909
*/
910
static PyObject *
911
build_string(SubString *input, PyObject *args, PyObject *kwargs,
912
             int recursion_depth, AutoNumber *auto_number)
913
8.34M
{
914
8.34M
    _PyUnicodeWriter writer;
915
916
    /* check the recursion level */
917
8.34M
    if (recursion_depth <= 0) {
918
0
        PyErr_SetString(PyExc_ValueError,
919
0
                        "Max string recursion exceeded");
920
0
        return NULL;
921
0
    }
922
923
8.34M
    _PyUnicodeWriter_Init(&writer);
924
8.34M
    writer.overallocate = 1;
925
8.34M
    writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
926
927
8.34M
    if (!do_markup(input, args, kwargs, &writer, recursion_depth,
928
8.34M
                   auto_number)) {
929
1
        _PyUnicodeWriter_Dealloc(&writer);
930
1
        return NULL;
931
1
    }
932
933
8.34M
    return _PyUnicodeWriter_Finish(&writer);
934
8.34M
}
935
936
/************************************************************************/
937
/*********** main routine ***********************************************/
938
/************************************************************************/
939
940
/* this is the main entry point */
941
static PyObject *
942
do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
943
8.34M
{
944
8.34M
    SubString input;
945
946
    /* PEP 3101 says only 2 levels, so that
947
       "{0:{1}}".format('abc', 's')            # works
948
       "{0:{1:{2}}}".format('abc', 's', '')    # fails
949
    */
950
8.34M
    int recursion_depth = 2;
951
952
8.34M
    AutoNumber auto_number;
953
8.34M
    AutoNumber_Init(&auto_number);
954
8.34M
    SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self));
955
8.34M
    return build_string(&input, args, kwargs, recursion_depth, &auto_number);
956
8.34M
}
957
958
static PyObject *
959
do_string_format_map(PyObject *self, PyObject *obj)
960
0
{
961
0
    return do_string_format(self, NULL, obj);
962
0
}
963
964
965
/************************************************************************/
966
/*********** formatteriterator ******************************************/
967
/************************************************************************/
968
969
/* This is used to implement string.Formatter.vparse().  It exists so
970
   Formatter can share code with the built in unicode.format() method.
971
   It's really just a wrapper around MarkupIterator that is callable
972
   from Python. */
973
974
typedef struct {
975
    PyObject_HEAD
976
    PyObject *str;
977
    MarkupIterator it_markup;
978
} formatteriterobject;
979
980
static void
981
formatteriter_dealloc(PyObject *op)
982
0
{
983
0
    formatteriterobject *it = (formatteriterobject*)op;
984
0
    Py_XDECREF(it->str);
985
0
    PyObject_Free(it);
986
0
}
987
988
/* returns a tuple:
989
   (literal, field_name, format_spec, conversion)
990
991
   literal is any literal text to output.  might be zero length
992
   field_name is the string before the ':'.  might be None
993
   format_spec is the string after the ':'.  might be None
994
   conversion is either None, or the string after the '!'
995
*/
996
static PyObject *
997
formatteriter_next(PyObject *op)
998
0
{
999
0
    formatteriterobject *it = (formatteriterobject*)op;
1000
0
    SubString literal;
1001
0
    SubString field_name;
1002
0
    SubString format_spec;
1003
0
    Py_UCS4 conversion;
1004
0
    int format_spec_needs_expanding;
1005
0
    int field_present;
1006
0
    int result = MarkupIterator_next(&it->it_markup, &literal, &field_present,
1007
0
                                     &field_name, &format_spec, &conversion,
1008
0
                                     &format_spec_needs_expanding);
1009
1010
    /* all of the SubString objects point into it->str, so no
1011
       memory management needs to be done on them */
1012
0
    assert(0 <= result && result <= 2);
1013
0
    if (result == 0 || result == 1)
1014
        /* if 0, error has already been set, if 1, iterator is empty */
1015
0
        return NULL;
1016
0
    else {
1017
0
        PyObject *literal_str = NULL;
1018
0
        PyObject *field_name_str = NULL;
1019
0
        PyObject *format_spec_str = NULL;
1020
0
        PyObject *conversion_str = NULL;
1021
0
        PyObject *tuple = NULL;
1022
1023
0
        literal_str = SubString_new_object(&literal);
1024
0
        if (literal_str == NULL)
1025
0
            goto done;
1026
1027
0
        field_name_str = SubString_new_object(&field_name);
1028
0
        if (field_name_str == NULL)
1029
0
            goto done;
1030
1031
        /* if field_name is non-zero length, return a string for
1032
           format_spec (even if zero length), else return None */
1033
0
        format_spec_str = (field_present ?
1034
0
                           SubString_new_object_or_empty :
1035
0
                           SubString_new_object)(&format_spec);
1036
0
        if (format_spec_str == NULL)
1037
0
            goto done;
1038
1039
        /* if the conversion is not specified, return a None,
1040
           otherwise create a one length string with the conversion
1041
           character */
1042
0
        if (conversion == '\0') {
1043
0
            conversion_str = Py_NewRef(Py_None);
1044
0
        }
1045
0
        else
1046
0
            conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
1047
0
                                                       &conversion, 1);
1048
0
        if (conversion_str == NULL)
1049
0
            goto done;
1050
1051
0
        tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str,
1052
0
                             conversion_str);
1053
0
    done:
1054
0
        Py_XDECREF(literal_str);
1055
0
        Py_XDECREF(field_name_str);
1056
0
        Py_XDECREF(format_spec_str);
1057
0
        Py_XDECREF(conversion_str);
1058
0
        return tuple;
1059
0
    }
1060
0
}
1061
1062
static PyMethodDef formatteriter_methods[] = {
1063
    {NULL,              NULL}           /* sentinel */
1064
};
1065
1066
static PyTypeObject PyFormatterIter_Type = {
1067
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1068
    "formatteriterator",                /* tp_name */
1069
    sizeof(formatteriterobject),        /* tp_basicsize */
1070
    0,                                  /* tp_itemsize */
1071
    /* methods */
1072
    formatteriter_dealloc,              /* tp_dealloc */
1073
    0,                                  /* tp_vectorcall_offset */
1074
    0,                                  /* tp_getattr */
1075
    0,                                  /* tp_setattr */
1076
    0,                                  /* tp_as_async */
1077
    0,                                  /* tp_repr */
1078
    0,                                  /* tp_as_number */
1079
    0,                                  /* tp_as_sequence */
1080
    0,                                  /* tp_as_mapping */
1081
    0,                                  /* tp_hash */
1082
    0,                                  /* tp_call */
1083
    0,                                  /* tp_str */
1084
    PyObject_GenericGetAttr,            /* tp_getattro */
1085
    0,                                  /* tp_setattro */
1086
    0,                                  /* tp_as_buffer */
1087
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1088
    0,                                  /* tp_doc */
1089
    0,                                  /* tp_traverse */
1090
    0,                                  /* tp_clear */
1091
    0,                                  /* tp_richcompare */
1092
    0,                                  /* tp_weaklistoffset */
1093
    PyObject_SelfIter,                  /* tp_iter */
1094
    formatteriter_next,                 /* tp_iternext */
1095
    formatteriter_methods,              /* tp_methods */
1096
    0,
1097
};
1098
1099
/* unicode_formatter_parser is used to implement
1100
   string.Formatter.vformat.  it parses a string and returns tuples
1101
   describing the parsed elements.  It's a wrapper around
1102
   stringlib/string_format.h's MarkupIterator */
1103
static PyObject *
1104
formatter_parser(PyObject *Py_UNUSED(module), PyObject *self)
1105
0
{
1106
0
    formatteriterobject *it;
1107
1108
0
    if (!PyUnicode_Check(self)) {
1109
0
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1110
0
        return NULL;
1111
0
    }
1112
1113
0
    it = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
1114
0
    if (it == NULL)
1115
0
        return NULL;
1116
1117
    /* take ownership, give the object to the iterator */
1118
0
    it->str = Py_NewRef(self);
1119
1120
    /* initialize the contained MarkupIterator */
1121
0
    MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self));
1122
0
    return (PyObject *)it;
1123
0
}
1124
1125
1126
/************************************************************************/
1127
/*********** fieldnameiterator ******************************************/
1128
/************************************************************************/
1129
1130
1131
/* This is used to implement string.Formatter.vparse().  It parses the
1132
   field name into attribute and item values.  It's a Python-callable
1133
   wrapper around FieldNameIterator */
1134
1135
typedef struct {
1136
    PyObject_HEAD
1137
    PyObject *str;
1138
    FieldNameIterator it_field;
1139
} fieldnameiterobject;
1140
1141
static void
1142
fieldnameiter_dealloc(PyObject *op)
1143
0
{
1144
0
    fieldnameiterobject *it = (fieldnameiterobject*)op;
1145
0
    Py_XDECREF(it->str);
1146
0
    PyObject_Free(it);
1147
0
}
1148
1149
/* returns a tuple:
1150
   (is_attr, value)
1151
   is_attr is true if we used attribute syntax (e.g., '.foo')
1152
              false if we used index syntax (e.g., '[foo]')
1153
   value is an integer or string
1154
*/
1155
static PyObject *
1156
fieldnameiter_next(PyObject *op)
1157
0
{
1158
0
    fieldnameiterobject *it = (fieldnameiterobject*)op;
1159
0
    int result;
1160
0
    int is_attr;
1161
0
    Py_ssize_t idx;
1162
0
    SubString name;
1163
1164
0
    result = FieldNameIterator_next(&it->it_field, &is_attr,
1165
0
                                    &idx, &name);
1166
0
    if (result == 0 || result == 1)
1167
        /* if 0, error has already been set, if 1, iterator is empty */
1168
0
        return NULL;
1169
0
    else {
1170
0
        PyObject* result = NULL;
1171
0
        PyObject* is_attr_obj = NULL;
1172
0
        PyObject* obj = NULL;
1173
1174
0
        is_attr_obj = PyBool_FromLong(is_attr);
1175
0
        if (is_attr_obj == NULL)
1176
0
            goto error;
1177
1178
        /* either an integer or a string */
1179
0
        if (idx != -1)
1180
0
            obj = PyLong_FromSsize_t(idx);
1181
0
        else
1182
0
            obj = SubString_new_object(&name);
1183
0
        if (obj == NULL)
1184
0
            goto error;
1185
1186
        /* return a tuple of values */
1187
0
        return _PyTuple_FromPairSteal(is_attr_obj, obj);
1188
1189
0
    error:
1190
0
        Py_XDECREF(is_attr_obj);
1191
0
        Py_XDECREF(obj);
1192
0
        return result;
1193
0
    }
1194
0
}
1195
1196
static PyMethodDef fieldnameiter_methods[] = {
1197
    {NULL,              NULL}           /* sentinel */
1198
};
1199
1200
static PyTypeObject PyFieldNameIter_Type = {
1201
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1202
    "fieldnameiterator",                /* tp_name */
1203
    sizeof(fieldnameiterobject),        /* tp_basicsize */
1204
    0,                                  /* tp_itemsize */
1205
    /* methods */
1206
    fieldnameiter_dealloc,              /* tp_dealloc */
1207
    0,                                  /* tp_vectorcall_offset */
1208
    0,                                  /* tp_getattr */
1209
    0,                                  /* tp_setattr */
1210
    0,                                  /* tp_as_async */
1211
    0,                                  /* tp_repr */
1212
    0,                                  /* tp_as_number */
1213
    0,                                  /* tp_as_sequence */
1214
    0,                                  /* tp_as_mapping */
1215
    0,                                  /* tp_hash */
1216
    0,                                  /* tp_call */
1217
    0,                                  /* tp_str */
1218
    PyObject_GenericGetAttr,            /* tp_getattro */
1219
    0,                                  /* tp_setattro */
1220
    0,                                  /* tp_as_buffer */
1221
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1222
    0,                                  /* tp_doc */
1223
    0,                                  /* tp_traverse */
1224
    0,                                  /* tp_clear */
1225
    0,                                  /* tp_richcompare */
1226
    0,                                  /* tp_weaklistoffset */
1227
    PyObject_SelfIter,                  /* tp_iter */
1228
    fieldnameiter_next,                 /* tp_iternext */
1229
    fieldnameiter_methods,              /* tp_methods */
1230
    0};
1231
1232
/* unicode_formatter_field_name_split is used to implement
1233
   string.Formatter.vformat.  it takes a PEP 3101 "field name", and
1234
   returns a tuple of (first, rest): "first", the part before the
1235
   first '.' or '['; and "rest", an iterator for the rest of the field
1236
   name.  it's a wrapper around stringlib/string_format.h's
1237
   field_name_split.  The iterator it returns is a
1238
   FieldNameIterator */
1239
static PyObject *
1240
formatter_field_name_split(PyObject *Py_UNUSED(module), PyObject *self)
1241
0
{
1242
0
    SubString first;
1243
0
    Py_ssize_t first_idx;
1244
0
    fieldnameiterobject *it;
1245
1246
0
    PyObject *first_obj = NULL;
1247
0
    PyObject *result = NULL;
1248
1249
0
    if (!PyUnicode_Check(self)) {
1250
0
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1251
0
        return NULL;
1252
0
    }
1253
1254
0
    it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
1255
0
    if (it == NULL)
1256
0
        return NULL;
1257
1258
    /* take ownership, give the object to the iterator.  this is
1259
       just to keep the field_name alive */
1260
0
    it->str = Py_NewRef(self);
1261
1262
    /* Pass in auto_number = NULL. We'll return an empty string for
1263
       first_obj in that case. */
1264
0
    if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self),
1265
0
                          &first, &first_idx, &it->it_field, NULL))
1266
0
        goto error;
1267
1268
    /* first becomes an integer, if possible; else a string */
1269
0
    if (first_idx != -1)
1270
0
        first_obj = PyLong_FromSsize_t(first_idx);
1271
0
    else
1272
        /* convert "first" into a string object */
1273
0
        first_obj = SubString_new_object(&first);
1274
0
    if (first_obj == NULL)
1275
0
        goto error;
1276
1277
    /* return a tuple of values */
1278
0
    return _PyTuple_FromPairSteal(first_obj, (PyObject *)it);
1279
1280
0
error:
1281
0
    Py_XDECREF(it);
1282
0
    Py_XDECREF(first_obj);
1283
0
    return result;
1284
0
}