Coverage Report

Created: 2025-08-24 07:03

/src/cpython/Objects/stringlib/unicode_format.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
    unicode_format.h -- implementation of str.format().
3
*/
4
5
#include "pycore_complexobject.h" // _PyComplex_FormatAdvancedWriter()
6
#include "pycore_floatobject.h"   // _PyFloat_FormatAdvancedWriter()
7
8
/************************************************************************/
9
/***********   Global data structures and forward declarations  *********/
10
/************************************************************************/
11
12
/*
13
   A SubString consists of the characters between two string or
14
   unicode pointers.
15
*/
16
typedef struct {
17
    PyObject *str; /* borrowed reference */
18
    Py_ssize_t start, end;
19
} SubString;
20
21
22
/* Keep track if we're auto-numbering fields.  The explicit values are
   the ones the compiler would assign implicitly. */
typedef enum {
    ANS_INIT = 0,     /* no numeric field seen yet */
    ANS_AUTO = 1,     /* fields are numbered automatically ("{}") */
    ANS_MANUAL = 2    /* fields carry explicit numbers ("{0}") */
} AutoNumberState;
27
28
/* Keeps track of our auto-numbering state, and which number field we're on */
29
typedef struct {
30
    AutoNumberState an_state;
31
    int an_field_number;
32
} AutoNumber;
33
34
35
/* forward declaration for recursion */
36
static PyObject *
37
build_string(SubString *input, PyObject *args, PyObject *kwargs,
38
             int recursion_depth, AutoNumber *auto_number);
39
40
41
42
/************************************************************************/
43
/**************************  Utility  functions  ************************/
44
/************************************************************************/
45
46
static void
47
AutoNumber_Init(AutoNumber *auto_number)
48
8.47M
{
49
8.47M
    auto_number->an_state = ANS_INIT;
50
8.47M
    auto_number->an_field_number = 0;
51
8.47M
}
52
53
/* fill in a SubString from a pointer and length */
54
Py_LOCAL_INLINE(void)
55
SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end)
56
160M
{
57
160M
    str->str = s;
58
160M
    str->start = start;
59
160M
    str->end = end;
60
160M
}
61
62
/* return a new string.  if str->str is NULL, return None */
63
Py_LOCAL_INLINE(PyObject *)
64
SubString_new_object(SubString *str)
65
42
{
66
42
    if (str->str == NULL)
67
0
        Py_RETURN_NONE;
68
42
    return PyUnicode_Substring(str->str, str->start, str->end);
69
42
}
70
71
/* return a new string.  if str->str is NULL, return a new empty string */
72
Py_LOCAL_INLINE(PyObject *)
73
SubString_new_object_or_empty(SubString *str)
74
0
{
75
0
    if (str->str == NULL) {
76
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
77
0
    }
78
0
    return SubString_new_object(str);
79
0
}
80
81
/* Return 1 if an error has been detected switching between automatic
82
   field numbering and manual field specification, else return 0. Set
83
   ValueError on error. */
84
static int
85
autonumber_state_error(AutoNumberState state, int field_name_is_empty)
86
15.9M
{
87
15.9M
    if (state == ANS_MANUAL) {
88
64
        if (field_name_is_empty) {
89
0
            PyErr_SetString(PyExc_ValueError, "cannot switch from "
90
0
                            "manual field specification to "
91
0
                            "automatic field numbering");
92
0
            return 1;
93
0
        }
94
64
    }
95
15.9M
    else {
96
15.9M
        if (!field_name_is_empty) {
97
0
            PyErr_SetString(PyExc_ValueError, "cannot switch from "
98
0
                            "automatic field numbering to "
99
0
                            "manual field specification");
100
0
            return 1;
101
0
        }
102
15.9M
    }
103
15.9M
    return 0;
104
15.9M
}
105
106
107
/************************************************************************/
108
/***********  Format string parsing -- integers and identifiers *********/
109
/************************************************************************/
110
111
static Py_ssize_t
112
get_integer(const SubString *str)
113
15.9M
{
114
15.9M
    Py_ssize_t accumulator = 0;
115
15.9M
    Py_ssize_t digitval;
116
15.9M
    Py_ssize_t i;
117
118
    /* empty string is an error */
119
15.9M
    if (str->start >= str->end)
120
15.9M
        return -1;
121
122
170
    for (i = str->start; i < str->end; i++) {
123
106
        digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i));
124
106
        if (digitval < 0)
125
42
            return -1;
126
        /*
127
           Detect possible overflow before it happens:
128
129
              accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if
130
              accumulator > (PY_SSIZE_T_MAX - digitval) / 10.
131
        */
132
64
        if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
133
0
            PyErr_Format(PyExc_ValueError,
134
0
                         "Too many decimal digits in format string");
135
0
            return -1;
136
0
        }
137
64
        accumulator = accumulator * 10 + digitval;
138
64
    }
139
64
    return accumulator;
140
106
}
141
142
/************************************************************************/
143
/******** Functions to get field objects and specification strings ******/
144
/************************************************************************/
145
146
/* do the equivalent of obj.name */
147
static PyObject *
148
getattr(PyObject *obj, SubString *name)
149
0
{
150
0
    PyObject *newobj;
151
0
    PyObject *str = SubString_new_object(name);
152
0
    if (str == NULL)
153
0
        return NULL;
154
0
    newobj = PyObject_GetAttr(obj, str);
155
0
    Py_DECREF(str);
156
0
    return newobj;
157
0
}
158
159
/* do the equivalent of obj[idx], where obj is a sequence */
160
static PyObject *
161
getitem_sequence(PyObject *obj, Py_ssize_t idx)
162
0
{
163
0
    return PySequence_GetItem(obj, idx);
164
0
}
165
166
/* do the equivalent of obj[idx], where obj is not a sequence */
167
static PyObject *
168
getitem_idx(PyObject *obj, Py_ssize_t idx)
169
0
{
170
0
    PyObject *newobj;
171
0
    PyObject *idx_obj = PyLong_FromSsize_t(idx);
172
0
    if (idx_obj == NULL)
173
0
        return NULL;
174
0
    newobj = PyObject_GetItem(obj, idx_obj);
175
0
    Py_DECREF(idx_obj);
176
0
    return newobj;
177
0
}
178
179
/* do the equivalent of obj[name] */
180
static PyObject *
181
getitem_str(PyObject *obj, SubString *name)
182
0
{
183
0
    PyObject *newobj;
184
0
    PyObject *str = SubString_new_object(name);
185
0
    if (str == NULL)
186
0
        return NULL;
187
0
    newobj = PyObject_GetItem(obj, str);
188
0
    Py_DECREF(str);
189
0
    return newobj;
190
0
}
191
192
typedef struct {
193
    /* the entire string we're parsing.  we assume that someone else
194
       is managing its lifetime, and that it will exist for the
195
       lifetime of the iterator.  can be empty */
196
    SubString str;
197
198
    /* index to where we are inside field_name */
199
    Py_ssize_t index;
200
} FieldNameIterator;
201
202
203
static int
204
FieldNameIterator_init(FieldNameIterator *self, PyObject *s,
205
                       Py_ssize_t start, Py_ssize_t end)
206
15.9M
{
207
15.9M
    SubString_init(&self->str, s, start, end);
208
15.9M
    self->index = start;
209
15.9M
    return 1;
210
15.9M
}
211
212
static int
213
_FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
214
0
{
215
0
    Py_UCS4 c;
216
217
0
    name->str = self->str.str;
218
0
    name->start = self->index;
219
220
    /* return everything until '.' or '[' */
221
0
    while (self->index < self->str.end) {
222
0
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
223
0
        switch (c) {
224
0
        case '[':
225
0
        case '.':
226
            /* backup so that we this character will be seen next time */
227
0
            self->index--;
228
0
            break;
229
0
        default:
230
0
            continue;
231
0
        }
232
0
        break;
233
0
    }
234
    /* end of string is okay */
235
0
    name->end = self->index;
236
0
    return 1;
237
0
}
238
239
static int
240
_FieldNameIterator_item(FieldNameIterator *self, SubString *name)
241
0
{
242
0
    int bracket_seen = 0;
243
0
    Py_UCS4 c;
244
245
0
    name->str = self->str.str;
246
0
    name->start = self->index;
247
248
    /* return everything until ']' */
249
0
    while (self->index < self->str.end) {
250
0
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
251
0
        switch (c) {
252
0
        case ']':
253
0
            bracket_seen = 1;
254
0
            break;
255
0
        default:
256
0
            continue;
257
0
        }
258
0
        break;
259
0
    }
260
    /* make sure we ended with a ']' */
261
0
    if (!bracket_seen) {
262
0
        PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
263
0
        return 0;
264
0
    }
265
266
    /* end of string is okay */
267
    /* don't include the ']' */
268
0
    name->end = self->index-1;
269
0
    return 1;
270
0
}
271
272
/* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */
273
static int
274
FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
275
                       Py_ssize_t *name_idx, SubString *name)
276
15.9M
{
277
    /* check at end of input */
278
15.9M
    if (self->index >= self->str.end)
279
15.9M
        return 1;
280
281
0
    switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) {
282
0
    case '.':
283
0
        *is_attribute = 1;
284
0
        if (_FieldNameIterator_attr(self, name) == 0)
285
0
            return 0;
286
0
        *name_idx = -1;
287
0
        break;
288
0
    case '[':
289
0
        *is_attribute = 0;
290
0
        if (_FieldNameIterator_item(self, name) == 0)
291
0
            return 0;
292
0
        *name_idx = get_integer(name);
293
0
        if (*name_idx == -1 && PyErr_Occurred())
294
0
            return 0;
295
0
        break;
296
0
    default:
297
        /* Invalid character follows ']' */
298
0
        PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
299
0
                        "follow ']' in format field specifier");
300
0
        return 0;
301
0
    }
302
303
    /* empty string is an error */
304
0
    if (name->start == name->end) {
305
0
        PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
306
0
        return 0;
307
0
    }
308
309
0
    return 2;
310
0
}
311
312
313
/* input: field_name
314
   output: 'first' points to the part before the first '[' or '.'
315
           'first_idx' is -1 if 'first' is not an integer, otherwise
316
                       it's the value of first converted to an integer
317
           'rest' is an iterator to return the rest
318
*/
319
static int
320
field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first,
321
                 Py_ssize_t *first_idx, FieldNameIterator *rest,
322
                 AutoNumber *auto_number)
323
15.9M
{
324
15.9M
    Py_UCS4 c;
325
15.9M
    Py_ssize_t i = start;
326
15.9M
    int field_name_is_empty;
327
15.9M
    int using_numeric_index;
328
329
    /* find the part up until the first '.' or '[' */
330
15.9M
    while (i < end) {
331
428
        switch (c = PyUnicode_READ_CHAR(str, i++)) {
332
0
        case '[':
333
0
        case '.':
334
            /* backup so that we this character is available to the
335
               "rest" iterator */
336
0
            i--;
337
0
            break;
338
428
        default:
339
428
            continue;
340
428
        }
341
0
        break;
342
428
    }
343
344
    /* set up the return values */
345
15.9M
    SubString_init(first, str, start, i);
346
15.9M
    FieldNameIterator_init(rest, str, i, end);
347
348
    /* see if "first" is an integer, in which case it's used as an index */
349
15.9M
    *first_idx = get_integer(first);
350
15.9M
    if (*first_idx == -1 && PyErr_Occurred())
351
0
        return 0;
352
353
15.9M
    field_name_is_empty = first->start >= first->end;
354
355
    /* If the field name is omitted or if we have a numeric index
356
       specified, then we're doing numeric indexing into args. */
357
15.9M
    using_numeric_index = field_name_is_empty || *first_idx != -1;
358
359
    /* We always get here exactly one time for each field we're
360
       processing. And we get here in field order (counting by left
361
       braces). So this is the perfect place to handle automatic field
362
       numbering if the field name is omitted. */
363
364
    /* Check if we need to do the auto-numbering. It's not needed if
365
       we're called from string.Format routines, because it's handled
366
       in that class by itself. */
367
15.9M
    if (auto_number) {
368
        /* Initialize our auto numbering state if this is the first
369
           time we're either auto-numbering or manually numbering. */
370
15.9M
        if (auto_number->an_state == ANS_INIT && using_numeric_index)
371
8.47M
            auto_number->an_state = field_name_is_empty ?
372
8.47M
                ANS_AUTO : ANS_MANUAL;
373
374
        /* Make sure our state is consistent with what we're doing
375
           this time through. Only check if we're using a numeric
376
           index. */
377
15.9M
        if (using_numeric_index)
378
15.9M
            if (autonumber_state_error(auto_number->an_state,
379
15.9M
                                       field_name_is_empty))
380
0
                return 0;
381
        /* Zero length field means we want to do auto-numbering of the
382
           fields. */
383
15.9M
        if (field_name_is_empty)
384
15.9M
            *first_idx = (auto_number->an_field_number)++;
385
15.9M
    }
386
387
15.9M
    return 1;
388
15.9M
}
389
390
391
/*
392
    get_field_object returns the object inside {}, before the
393
    format_spec.  It handles getindex and getattr lookups and consumes
394
    the entire input string.
395
*/
396
static PyObject *
397
get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
398
                 AutoNumber *auto_number)
399
15.9M
{
400
15.9M
    PyObject *obj = NULL;
401
15.9M
    int ok;
402
15.9M
    int is_attribute;
403
15.9M
    SubString name;
404
15.9M
    SubString first;
405
15.9M
    Py_ssize_t index;
406
15.9M
    FieldNameIterator rest;
407
408
15.9M
    if (!field_name_split(input->str, input->start, input->end, &first,
409
15.9M
                          &index, &rest, auto_number)) {
410
0
        goto error;
411
0
    }
412
413
15.9M
    if (index == -1) {
414
        /* look up in kwargs */
415
42
        PyObject *key = SubString_new_object(&first);
416
42
        if (key == NULL) {
417
0
            goto error;
418
0
        }
419
42
        if (kwargs == NULL) {
420
0
            PyErr_SetObject(PyExc_KeyError, key);
421
0
            Py_DECREF(key);
422
0
            goto error;
423
0
        }
424
        /* Use PyObject_GetItem instead of PyDict_GetItem because this
425
           code is no longer just used with kwargs. It might be passed
426
           a non-dict when called through format_map. */
427
42
        obj = PyObject_GetItem(kwargs, key);
428
42
        Py_DECREF(key);
429
42
        if (obj == NULL) {
430
0
            goto error;
431
0
        }
432
42
    }
433
15.9M
    else {
434
        /* If args is NULL, we have a format string with a positional field
435
           with only kwargs to retrieve it from. This can only happen when
436
           used with format_map(), where positional arguments are not
437
           allowed. */
438
15.9M
        if (args == NULL) {
439
0
            PyErr_SetString(PyExc_ValueError, "Format string contains "
440
0
                            "positional fields");
441
0
            goto error;
442
0
        }
443
444
        /* look up in args */
445
15.9M
        obj = PySequence_GetItem(args, index);
446
15.9M
        if (obj == NULL) {
447
0
            PyErr_Format(PyExc_IndexError,
448
0
                         "Replacement index %zd out of range for positional "
449
0
                         "args tuple",
450
0
                         index);
451
0
             goto error;
452
0
        }
453
15.9M
    }
454
455
    /* iterate over the rest of the field_name */
456
15.9M
    while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
457
15.9M
                                        &name)) == 2) {
458
0
        PyObject *tmp;
459
460
0
        if (is_attribute)
461
            /* getattr lookup "." */
462
0
            tmp = getattr(obj, &name);
463
0
        else
464
            /* getitem lookup "[]" */
465
0
            if (index == -1)
466
0
                tmp = getitem_str(obj, &name);
467
0
            else
468
0
                if (PySequence_Check(obj))
469
0
                    tmp = getitem_sequence(obj, index);
470
0
                else
471
                    /* not a sequence */
472
0
                    tmp = getitem_idx(obj, index);
473
0
        if (tmp == NULL)
474
0
            goto error;
475
476
        /* assign to obj */
477
0
        Py_SETREF(obj, tmp);
478
0
    }
479
    /* end of iterator, this is the non-error case */
480
15.9M
    if (ok == 1)
481
15.9M
        return obj;
482
0
error:
483
0
    Py_XDECREF(obj);
484
0
    return NULL;
485
15.9M
}
486
487
/************************************************************************/
488
/*****************  Field rendering functions  **************************/
489
/************************************************************************/
490
491
/*
492
    render_field() is the main function in this section.  It takes the
493
    field object and field specification string generated by
494
    get_field_and_spec, and renders the field into the output string.
495
496
    render_field calls fieldobj.__format__(format_spec) method, and
497
    appends to the output.
498
*/
499
static int
500
render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer)
501
15.9M
{
502
15.9M
    int ok = 0;
503
15.9M
    PyObject *result = NULL;
504
15.9M
    PyObject *format_spec_object = NULL;
505
15.9M
    int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
506
15.9M
    int err;
507
508
    /* If we know the type exactly, skip the lookup of __format__ and just
509
       call the formatter directly. */
510
15.9M
    if (PyUnicode_CheckExact(fieldobj))
511
15.6M
        formatter = _PyUnicode_FormatAdvancedWriter;
512
241k
    else if (PyLong_CheckExact(fieldobj))
513
73.3k
        formatter = _PyLong_FormatAdvancedWriter;
514
168k
    else if (PyFloat_CheckExact(fieldobj))
515
0
        formatter = _PyFloat_FormatAdvancedWriter;
516
168k
    else if (PyComplex_CheckExact(fieldobj))
517
0
        formatter = _PyComplex_FormatAdvancedWriter;
518
519
15.9M
    if (formatter) {
520
        /* we know exactly which formatter will be called when __format__ is
521
           looked up, so call it directly, instead. */
522
15.7M
        err = formatter(writer, fieldobj, format_spec->str,
523
15.7M
                        format_spec->start, format_spec->end);
524
15.7M
        return (err == 0);
525
15.7M
    }
526
168k
    else {
527
        /* We need to create an object out of the pointers we have, because
528
           __format__ takes a string/unicode object for format_spec. */
529
168k
        if (format_spec->str)
530
0
            format_spec_object = PyUnicode_Substring(format_spec->str,
531
0
                                                     format_spec->start,
532
0
                                                     format_spec->end);
533
168k
        else
534
168k
            format_spec_object = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
535
168k
        if (format_spec_object == NULL)
536
0
            goto done;
537
538
168k
        result = PyObject_Format(fieldobj, format_spec_object);
539
168k
    }
540
168k
    if (result == NULL)
541
1
        goto done;
542
543
168k
    if (_PyUnicodeWriter_WriteStr(writer, result) == -1)
544
0
        goto done;
545
168k
    ok = 1;
546
547
168k
done:
548
168k
    Py_XDECREF(format_spec_object);
549
168k
    Py_XDECREF(result);
550
168k
    return ok;
551
168k
}
552
553
static int
554
parse_field(SubString *str, SubString *field_name, SubString *format_spec,
555
            int *format_spec_needs_expanding, Py_UCS4 *conversion)
556
15.9M
{
557
    /* Note this function works if the field name is zero length,
558
       which is good.  Zero length field names are handled later, in
559
       field_name_split. */
560
561
15.9M
    Py_UCS4 c = 0;
562
563
    /* initialize these, as they may be empty */
564
15.9M
    *conversion = '\0';
565
15.9M
    SubString_init(format_spec, NULL, 0, 0);
566
567
    /* Search for the field name.  it's terminated by the end of
568
       the string, or a ':' or '!' */
569
15.9M
    field_name->str = str->str;
570
15.9M
    field_name->start = str->start;
571
15.9M
    while (str->start < str->end) {
572
15.9M
        switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
573
0
        case '{':
574
0
            PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
575
0
            return 0;
576
0
        case '[':
577
0
            for (; str->start < str->end; str->start++)
578
0
                if (PyUnicode_READ_CHAR(str->str, str->start) == ']')
579
0
                    break;
580
0
            continue;
581
15.3M
        case '}':
582
15.3M
        case ':':
583
15.9M
        case '!':
584
15.9M
            break;
585
428
        default:
586
428
            continue;
587
15.9M
        }
588
15.9M
        break;
589
15.9M
    }
590
591
15.9M
    field_name->end = str->start - 1;
592
15.9M
    if (c == '!' || c == ':') {
593
550k
        Py_ssize_t count;
594
        /* we have a format specifier and/or a conversion */
595
        /* don't include the last character */
596
597
        /* see if there's a conversion specifier */
598
550k
        if (c == '!') {
599
            /* there must be another character present */
600
550k
            if (str->start >= str->end) {
601
0
                PyErr_SetString(PyExc_ValueError,
602
0
                                "end of string while looking for conversion "
603
0
                                "specifier");
604
0
                return 0;
605
0
            }
606
550k
            *conversion = PyUnicode_READ_CHAR(str->str, str->start++);
607
608
550k
            if (str->start < str->end) {
609
550k
                c = PyUnicode_READ_CHAR(str->str, str->start++);
610
550k
                if (c == '}')
611
550k
                    return 1;
612
0
                if (c != ':') {
613
0
                    PyErr_SetString(PyExc_ValueError,
614
0
                                    "expected ':' after conversion specifier");
615
0
                    return 0;
616
0
                }
617
0
            }
618
550k
        }
619
64
        format_spec->str = str->str;
620
64
        format_spec->start = str->start;
621
64
        count = 1;
622
256
        while (str->start < str->end) {
623
256
            switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
624
0
            case '{':
625
0
                *format_spec_needs_expanding = 1;
626
0
                count++;
627
0
                break;
628
64
            case '}':
629
64
                count--;
630
64
                if (count == 0) {
631
64
                    format_spec->end = str->start - 1;
632
64
                    return 1;
633
64
                }
634
0
                break;
635
192
            default:
636
192
                break;
637
256
            }
638
256
        }
639
640
0
        PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
641
0
        return 0;
642
64
    }
643
15.3M
    else if (c != '}') {
644
0
        PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
645
0
        return 0;
646
0
    }
647
648
15.3M
    return 1;
649
15.9M
}
650
651
/************************************************************************/
652
/******* Output string allocation and escape-to-markup processing  ******/
653
/************************************************************************/
654
655
/* MarkupIterator breaks the string into pieces of either literal
656
   text, or things inside {} that need to be marked up.  it is
657
   designed to make it easy to wrap a Python iterator around it, for
658
   use with the Formatter class */
659
660
typedef struct {
661
    SubString str;
662
} MarkupIterator;
663
664
static int
665
MarkupIterator_init(MarkupIterator *self, PyObject *str,
666
                    Py_ssize_t start, Py_ssize_t end)
667
8.47M
{
668
8.47M
    SubString_init(&self->str, str, start, end);
669
8.47M
    return 1;
670
8.47M
}
671
672
/* returns 0 on error, 1 on non-error termination, and 2 if it got a
673
   string (or something to be expanded) */
674
static int
675
MarkupIterator_next(MarkupIterator *self, SubString *literal,
676
                    int *field_present, SubString *field_name,
677
                    SubString *format_spec, Py_UCS4 *conversion,
678
                    int *format_spec_needs_expanding)
679
32.0M
{
680
32.0M
    int at_end;
681
32.0M
    Py_UCS4 c = 0;
682
32.0M
    Py_ssize_t start;
683
32.0M
    Py_ssize_t len;
684
32.0M
    int markup_follows = 0;
685
686
    /* initialize all of the output variables */
687
32.0M
    SubString_init(literal, NULL, 0, 0);
688
32.0M
    SubString_init(field_name, NULL, 0, 0);
689
32.0M
    SubString_init(format_spec, NULL, 0, 0);
690
32.0M
    *conversion = '\0';
691
32.0M
    *format_spec_needs_expanding = 0;
692
32.0M
    *field_present = 0;
693
694
    /* No more input, end of iterator.  This is the normal exit
695
       path. */
696
32.0M
    if (self->str.start >= self->str.end)
697
8.47M
        return 1;
698
699
23.5M
    start = self->str.start;
700
701
    /* First read any literal text. Read until the end of string, an
702
       escaped '{' or '}', or an unescaped '{'.  In order to never
703
       allocate memory and so I can just pass pointers around, if
704
       there's an escaped '{' or '}' then we'll return the literal
705
       including the brace, but no format object.  The next time
706
       through, we'll return the rest of the literal, skipping past
707
       the second consecutive brace. */
708
72.0M
    while (self->str.start < self->str.end) {
709
64.3M
        switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
710
15.9M
        case '{':
711
15.9M
        case '}':
712
15.9M
            markup_follows = 1;
713
15.9M
            break;
714
48.4M
        default:
715
48.4M
            continue;
716
64.3M
        }
717
15.9M
        break;
718
64.3M
    }
719
720
23.5M
    at_end = self->str.start >= self->str.end;
721
23.5M
    len = self->str.start - start;
722
723
23.5M
    if ((c == '}') && (at_end ||
724
0
                       (c != PyUnicode_READ_CHAR(self->str.str,
725
0
                                                 self->str.start)))) {
726
0
        PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
727
0
                        "in format string");
728
0
        return 0;
729
0
    }
730
23.5M
    if (at_end && c == '{') {
731
0
        PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
732
0
                        "in format string");
733
0
        return 0;
734
0
    }
735
23.5M
    if (!at_end) {
736
15.9M
        if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) {
737
            /* escaped } or {, skip it in the input.  there is no
738
               markup object following us, just this literal text */
739
0
            self->str.start++;
740
0
            markup_follows = 0;
741
0
        }
742
15.9M
        else
743
15.9M
            len--;
744
15.9M
    }
745
746
    /* record the literal text */
747
23.5M
    literal->str = self->str.str;
748
23.5M
    literal->start = start;
749
23.5M
    literal->end = start + len;
750
751
23.5M
    if (!markup_follows)
752
7.67M
        return 2;
753
754
    /* this is markup; parse the field */
755
15.9M
    *field_present = 1;
756
15.9M
    if (!parse_field(&self->str, field_name, format_spec,
757
15.9M
                     format_spec_needs_expanding, conversion))
758
0
        return 0;
759
15.9M
    return 2;
760
15.9M
}
761
762
763
/* do the !r or !s conversion on obj */
764
static PyObject *
765
do_conversion(PyObject *obj, Py_UCS4 conversion)
766
550k
{
767
    /* XXX in pre-3.0, do we need to convert this to unicode, since it
768
       might have returned a string? */
769
550k
    switch (conversion) {
770
550k
    case 'r':
771
550k
        return PyObject_Repr(obj);
772
0
    case 's':
773
0
        return PyObject_Str(obj);
774
0
    case 'a':
775
0
        return PyObject_ASCII(obj);
776
0
    default:
777
0
        if (conversion > 32 && conversion < 127) {
778
                /* It's the ASCII subrange; casting to char is safe
779
                   (assuming the execution character set is an ASCII
780
                   superset). */
781
0
                PyErr_Format(PyExc_ValueError,
782
0
                     "Unknown conversion specifier %c",
783
0
                     (char)conversion);
784
0
        } else
785
0
                PyErr_Format(PyExc_ValueError,
786
0
                     "Unknown conversion specifier \\x%x",
787
0
                     (unsigned int)conversion);
788
0
        return NULL;
789
550k
    }
790
550k
}
791
792
/* given:
793
794
   {field_name!conversion:format_spec}
795
796
   compute the result and write it to output.
797
   format_spec_needs_expanding is an optimization.  if it's false,
798
   just output the string directly, otherwise recursively expand the
799
   format_spec string.
800
801
   field_name is allowed to be zero length, in which case we
802
   are doing auto field numbering.
803
*/
804
805
static int
806
output_markup(SubString *field_name, SubString *format_spec,
807
              int format_spec_needs_expanding, Py_UCS4 conversion,
808
              _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs,
809
              int recursion_depth, AutoNumber *auto_number)
810
15.9M
{
811
15.9M
    PyObject *tmp = NULL;
812
15.9M
    PyObject *fieldobj = NULL;
813
15.9M
    SubString expanded_format_spec;
814
15.9M
    SubString *actual_format_spec;
815
15.9M
    int result = 0;
816
817
    /* convert field_name to an object */
818
15.9M
    fieldobj = get_field_object(field_name, args, kwargs, auto_number);
819
15.9M
    if (fieldobj == NULL)
820
0
        goto done;
821
822
15.9M
    if (conversion != '\0') {
823
550k
        tmp = do_conversion(fieldobj, conversion);
824
550k
        if (tmp == NULL)
825
0
            goto done;
826
827
        /* do the assignment, transferring ownership: fieldobj = tmp */
828
550k
        Py_SETREF(fieldobj, tmp);
829
550k
        tmp = NULL;
830
550k
    }
831
832
    /* if needed, recursively compute the format_spec */
833
15.9M
    if (format_spec_needs_expanding) {
834
0
        tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
835
0
                           auto_number);
836
0
        if (tmp == NULL)
837
0
            goto done;
838
839
        /* note that in the case we're expanding the format string,
840
           tmp must be kept around until after the call to
841
           render_field. */
842
0
        SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp));
843
0
        actual_format_spec = &expanded_format_spec;
844
0
    }
845
15.9M
    else
846
15.9M
        actual_format_spec = format_spec;
847
848
15.9M
    if (render_field(fieldobj, actual_format_spec, writer) == 0)
849
1
        goto done;
850
851
15.9M
    result = 1;
852
853
15.9M
done:
854
15.9M
    Py_XDECREF(fieldobj);
855
15.9M
    Py_XDECREF(tmp);
856
857
15.9M
    return result;
858
15.9M
}
859
860
/*
861
    do_markup is the top-level loop for the format() method.  It
862
    searches through the format string for escapes to markup codes, and
863
    calls other functions to move non-markup text to the output,
864
    and to perform the markup to the output.

    Returns 0 as soon as writing a literal or output_markup() fails.
    Otherwise it returns the MarkupIterator_next() value that ended the
    loop (any value other than 2).
865
*/
866
static int
867
do_markup(SubString *input, PyObject *args, PyObject *kwargs,
868
          _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number)
869
8.47M
{
870
8.47M
    MarkupIterator iter;
871
8.47M
    int format_spec_needs_expanding;
872
8.47M
    int result;
873
8.47M
    int field_present;
874
8.47M
    SubString literal;
875
8.47M
    SubString field_name;
876
8.47M
    SubString format_spec;
877
8.47M
    Py_UCS4 conversion;
878
879
8.47M
    MarkupIterator_init(&iter, input->str, input->start, input->end);
880
32.0M
    /* keep going for as long as the iterator reports 2 */
    while ((result = MarkupIterator_next(&iter, &literal, &field_present,
881
32.0M
                                         &field_name, &format_spec,
882
32.0M
                                         &conversion,
883
32.0M
                                         &format_spec_needs_expanding)) == 2) {
884
23.5M
        if (literal.end != literal.start) {
885
16.1M
            /* no field follows and iter.str is empty (start == end):
               presumably the final write, so stop overallocating */
            if (!field_present && iter.str.start == iter.str.end)
886
7.67M
                writer->overallocate = 0;
887
16.1M
            if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
888
16.1M
                                                literal.start, literal.end) < 0)
889
0
                return 0;
890
16.1M
        }
891
892
23.5M
        if (field_present) {
893
15.9M
            /* iter.str is empty (start == end): presumably the final
               field, so stop overallocating before rendering it */
            if (iter.str.start == iter.str.end)
894
797k
                writer->overallocate = 0;
895
15.9M
            if (!output_markup(&field_name, &format_spec,
896
15.9M
                               format_spec_needs_expanding, conversion, writer,
897
15.9M
                               args, kwargs, recursion_depth, auto_number))
898
1
                return 0;
899
15.9M
        }
900
23.5M
    }
901
8.47M
    return result;
902
8.47M
}
903
904
905
/*
906
    build_string allocates the output string and then
907
    calls do_markup to do the heavy lifting.

    Returns the object produced by _PyUnicodeWriter_Finish().  Returns
    NULL with ValueError set when recursion_depth <= 0, and NULL (after
    deallocating the writer) when do_markup() returns 0.
908
*/
909
static PyObject *
910
build_string(SubString *input, PyObject *args, PyObject *kwargs,
911
             int recursion_depth, AutoNumber *auto_number)
912
8.47M
{
913
8.47M
    _PyUnicodeWriter writer;
914
915
    /* check the recursion level */
916
8.47M
    if (recursion_depth <= 0) {
917
0
        PyErr_SetString(PyExc_ValueError,
918
0
                        "Max string recursion exceeded");
919
0
        return NULL;
920
0
    }
921
922
8.47M
    _PyUnicodeWriter_Init(&writer);
923
8.47M
    writer.overallocate = 1;
924
8.47M
    /* min_length is seeded from the length of the whole input->str
       (not input->end - input->start) plus 100 */
    writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
925
926
8.47M
    if (!do_markup(input, args, kwargs, &writer, recursion_depth,
927
8.47M
                   auto_number)) {
928
1
        _PyUnicodeWriter_Dealloc(&writer);
929
1
        return NULL;
930
1
    }
931
932
8.47M
    return _PyUnicodeWriter_Finish(&writer);
933
8.47M
}
934
935
/************************************************************************/
936
/*********** main routine ***********************************************/
937
/************************************************************************/
938
939
/* this is the main entry point: it formats the whole of self (from index 0
   to its length) using args and kwargs, starting from a freshly
   initialized AutoNumber, and returns whatever build_string() returns */
940
static PyObject *
941
do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
942
8.47M
{
943
8.47M
    SubString input;
944
945
    /* PEP 3101 says only 2 levels, so that
946
       "{0:{1}}".format('abc', 's')            # works
947
       "{0:{1:{2}}}".format('abc', 's', '')    # fails
948
    */
949
8.47M
    int recursion_depth = 2;
950
951
8.47M
    AutoNumber auto_number;
952
8.47M
    AutoNumber_Init(&auto_number);
953
8.47M
    SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self));
954
8.47M
    return build_string(&input, args, kwargs, recursion_depth, &auto_number);
955
8.47M
}
956
957
/* do_string_format_map forwards to do_string_format() with args == NULL
   and obj passed in the kwargs position; the result is returned as is */
static PyObject *
958
do_string_format_map(PyObject *self, PyObject *obj)
959
0
{
960
0
    return do_string_format(self, NULL, obj);
961
0
}
962
963
964
/************************************************************************/
965
/*********** formatteriterator ******************************************/
966
/************************************************************************/
967
968
/* This is used to implement string.Formatter.parse().  It exists so
969
   Formatter can share code with the built in unicode.format() method.
970
   It's really just a wrapper around MarkupIterator that is callable
971
   from Python.

   NOTE(review): this comment previously said vparse(); string.Formatter
   appears to expose parse() rather than vparse() -- confirm against
   Lib/string.py. */
972
973
typedef struct {
974
    PyObject_HEAD
975
    PyObject *str;              /* strong reference, released in dealloc */
976
    MarkupIterator it_markup;   /* initialized over str by formatter_parser */
977
} formatteriterobject;
978
979
/* tp_dealloc for formatteriterobject: drops the reference held in
   it->str (if any) and frees the iterator's memory */
static void
980
formatteriter_dealloc(PyObject *op)
981
0
{
982
0
    formatteriterobject *it = (formatteriterobject*)op;
983
0
    Py_XDECREF(it->str);
984
0
    PyObject_Free(it);
985
0
}
986
987
/* returns a tuple:
988
   (literal, field_name, format_spec, conversion)
989
990
   literal is any literal text to output.  might be zero length
991
   field_name is the string before the ':'.  might be None
992
   format_spec is the string after the ':'.  might be None
993
   conversion is either None, or the string after the '!'

   returns NULL when MarkupIterator_next() reports 0 or 1, or when
   building any element of the tuple fails
994
*/
995
static PyObject *
996
formatteriter_next(PyObject *op)
997
0
{
998
0
    formatteriterobject *it = (formatteriterobject*)op;
999
0
    SubString literal;
1000
0
    SubString field_name;
1001
0
    SubString format_spec;
1002
0
    Py_UCS4 conversion;
1003
0
    int format_spec_needs_expanding;
1004
0
    int field_present;
1005
0
    int result = MarkupIterator_next(&it->it_markup, &literal, &field_present,
1006
0
                                     &field_name, &format_spec, &conversion,
1007
0
                                     &format_spec_needs_expanding);
1008
1009
    /* all of the SubString objects point into it->str, so no
1010
       memory management needs to be done on them */
1011
0
    assert(0 <= result && result <= 2);
1012
0
    if (result == 0 || result == 1)
1013
        /* if 0, error has already been set, if 1, iterator is empty */
1014
0
        return NULL;
1015
0
    else {
1016
0
        PyObject *literal_str = NULL;
1017
0
        PyObject *field_name_str = NULL;
1018
0
        PyObject *format_spec_str = NULL;
1019
0
        PyObject *conversion_str = NULL;
1020
0
        PyObject *tuple = NULL;
1021
1022
0
        literal_str = SubString_new_object(&literal);
1023
0
        if (literal_str == NULL)
1024
0
            goto done;
1025
1026
0
        field_name_str = SubString_new_object(&field_name);
1027
0
        if (field_name_str == NULL)
1028
0
            goto done;
1029
1030
        /* if a field is present (field_present is set), return a string for
1031
           format_spec (even if zero length), else return None */
1032
0
        format_spec_str = (field_present ?
1033
0
                           SubString_new_object_or_empty :
1034
0
                           SubString_new_object)(&format_spec);
1035
0
        if (format_spec_str == NULL)
1036
0
            goto done;
1037
1038
        /* if the conversion is not specified, return a None,
1039
           otherwise create a one length string with the conversion
1040
           character */
1041
0
        if (conversion == '\0') {
1042
0
            conversion_str = Py_NewRef(Py_None);
1043
0
        }
1044
0
        else
1045
0
            conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
1046
0
                                                       &conversion, 1);
1047
0
        if (conversion_str == NULL)
1048
0
            goto done;
1049
1050
0
        tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str,
1051
0
                             conversion_str);
1052
0
    /* shared exit: the local references are dropped on every path; tuple
       is still NULL if any step above failed */
    done:
1053
0
        Py_XDECREF(literal_str);
1054
0
        Py_XDECREF(field_name_str);
1055
0
        Py_XDECREF(format_spec_str);
1056
0
        Py_XDECREF(conversion_str);
1057
0
        return tuple;
1058
0
    }
1059
0
}
1060
1061
/* method table for formatteriterator: it contains only the sentinel
   entry, i.e. the type defines no methods through this table */
static PyMethodDef formatteriter_methods[] = {
1062
    {NULL,              NULL}           /* sentinel */
1063
};
1064
1065
/* type object for formatteriterobject.  Only tp_dealloc, tp_getattro,
   tp_flags, tp_iter, tp_iternext and tp_methods are filled in; the other
   listed slots are 0.  tp_iter is PyObject_SelfIter and tp_iternext is
   formatteriter_next. */
static PyTypeObject PyFormatterIter_Type = {
1066
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1067
    "formatteriterator",                /* tp_name */
1068
    sizeof(formatteriterobject),        /* tp_basicsize */
1069
    0,                                  /* tp_itemsize */
1070
    /* methods */
1071
    formatteriter_dealloc,              /* tp_dealloc */
1072
    0,                                  /* tp_vectorcall_offset */
1073
    0,                                  /* tp_getattr */
1074
    0,                                  /* tp_setattr */
1075
    0,                                  /* tp_as_async */
1076
    0,                                  /* tp_repr */
1077
    0,                                  /* tp_as_number */
1078
    0,                                  /* tp_as_sequence */
1079
    0,                                  /* tp_as_mapping */
1080
    0,                                  /* tp_hash */
1081
    0,                                  /* tp_call */
1082
    0,                                  /* tp_str */
1083
    PyObject_GenericGetAttr,            /* tp_getattro */
1084
    0,                                  /* tp_setattro */
1085
    0,                                  /* tp_as_buffer */
1086
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1087
    0,                                  /* tp_doc */
1088
    0,                                  /* tp_traverse */
1089
    0,                                  /* tp_clear */
1090
    0,                                  /* tp_richcompare */
1091
    0,                                  /* tp_weaklistoffset */
1092
    PyObject_SelfIter,                  /* tp_iter */
1093
    formatteriter_next,                 /* tp_iternext */
1094
    formatteriter_methods,              /* tp_methods */
1095
    0,
1096
};
1097
1098
/* unicode_formatter_parser is used to implement
1099
   string.Formatter.vformat.  it parses a string and returns tuples
1100
   describing the parsed elements.  It's a wrapper around
1101
   stringlib/string_format.h's MarkupIterator

   self must be a str, otherwise TypeError is raised.  On success the
   return value is a new formatteriterobject whose MarkupIterator spans
   all of self; NULL is returned if the type check or the allocation
   fails. */
1102
static PyObject *
1103
formatter_parser(PyObject *Py_UNUSED(module), PyObject *self)
1104
0
{
1105
0
    formatteriterobject *it;
1106
1107
0
    if (!PyUnicode_Check(self)) {
1108
0
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1109
0
        return NULL;
1110
0
    }
1111
1112
0
    it = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
1113
0
    if (it == NULL)
1114
0
        return NULL;
1115
1116
    /* take a new reference to self and store it in the iterator */
1117
0
    it->str = Py_NewRef(self);
1118
1119
    /* initialize the contained MarkupIterator */
1120
0
    MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self));
1121
0
    return (PyObject *)it;
1122
0
}
1123
1124
1125
/************************************************************************/
1126
/*********** fieldnameiterator ******************************************/
1127
/************************************************************************/
1128
1129
1130
/* This is used to implement string.Formatter.get_field().  It parses the
1131
   field name into attribute and item values.  It's a Python-callable
1132
   wrapper around FieldNameIterator

   NOTE(review): this comment previously said vparse(); string.Formatter
   appears to reach this code through get_field() -- confirm against
   Lib/string.py. */
1133
1134
typedef struct {
1135
    PyObject_HEAD
1136
    PyObject *str;              /* strong reference, released in dealloc */
1137
    FieldNameIterator it_field; /* filled in by field_name_split() */
1138
} fieldnameiterobject;
1139
1140
/* tp_dealloc for fieldnameiterobject: drops the reference held in
   it->str (if any) and frees the iterator's memory */
static void
1141
fieldnameiter_dealloc(PyObject *op)
1142
0
{
1143
0
    fieldnameiterobject *it = (fieldnameiterobject*)op;
1144
0
    Py_XDECREF(it->str);
1145
0
    PyObject_Free(it);
1146
0
}
1147
1148
/* returns a tuple:
1149
   (is_attr, value)
1150
   is_attr is true if we used attribute syntax (e.g., '.foo')
1151
              false if we used index syntax (e.g., '[foo]')
1152
   value is an integer or string

   returns NULL when FieldNameIterator_next() reports 0 or 1, or when
   building either element of the tuple fails
1153
*/
1154
static PyObject *
1155
fieldnameiter_next(PyObject *op)
1156
0
{
1157
0
    fieldnameiterobject *it = (fieldnameiterobject*)op;
1158
0
    int result;
1159
0
    int is_attr;
1160
0
    Py_ssize_t idx;
1161
0
    SubString name;
1162
1163
0
    result = FieldNameIterator_next(&it->it_field, &is_attr,
1164
0
                                    &idx, &name);
1165
0
    if (result == 0 || result == 1)
1166
        /* if 0, error has already been set, if 1, iterator is empty */
1167
0
        return NULL;
1168
0
    else {
1169
0
        /* note: this PyObject* result shadows the int result declared
           at function scope */
        PyObject* result = NULL;
1170
0
        PyObject* is_attr_obj = NULL;
1171
0
        PyObject* obj = NULL;
1172
1173
0
        is_attr_obj = PyBool_FromLong(is_attr);
1174
0
        if (is_attr_obj == NULL)
1175
0
            goto done;
1176
1177
        /* either an integer or a string; idx == -1 selects the string */
1178
0
        if (idx != -1)
1179
0
            obj = PyLong_FromSsize_t(idx);
1180
0
        else
1181
0
            obj = SubString_new_object(&name);
1182
0
        if (obj == NULL)
1183
0
            goto done;
1184
1185
        /* return a tuple of values */
1186
0
        result = PyTuple_Pack(2, is_attr_obj, obj);
1187
1188
0
    /* shared exit: the local references are dropped on every path; result
       is still NULL if any step above failed */
    done:
1189
0
        Py_XDECREF(is_attr_obj);
1190
0
        Py_XDECREF(obj);
1191
0
        return result;
1192
0
    }
1193
0
}
1194
1195
/* method table for fieldnameiterator: it contains only the sentinel
   entry, i.e. the type defines no methods through this table */
static PyMethodDef fieldnameiter_methods[] = {
1196
    {NULL,              NULL}           /* sentinel */
1197
};
1198
1199
/* type object for fieldnameiterobject.  Only tp_dealloc, tp_getattro,
   tp_flags, tp_iter, tp_iternext and tp_methods are filled in; the other
   listed slots are 0.  tp_iter is PyObject_SelfIter and tp_iternext is
   fieldnameiter_next. */
static PyTypeObject PyFieldNameIter_Type = {
1200
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1201
    "fieldnameiterator",                /* tp_name */
1202
    sizeof(fieldnameiterobject),        /* tp_basicsize */
1203
    0,                                  /* tp_itemsize */
1204
    /* methods */
1205
    fieldnameiter_dealloc,              /* tp_dealloc */
1206
    0,                                  /* tp_vectorcall_offset */
1207
    0,                                  /* tp_getattr */
1208
    0,                                  /* tp_setattr */
1209
    0,                                  /* tp_as_async */
1210
    0,                                  /* tp_repr */
1211
    0,                                  /* tp_as_number */
1212
    0,                                  /* tp_as_sequence */
1213
    0,                                  /* tp_as_mapping */
1214
    0,                                  /* tp_hash */
1215
    0,                                  /* tp_call */
1216
    0,                                  /* tp_str */
1217
    PyObject_GenericGetAttr,            /* tp_getattro */
1218
    0,                                  /* tp_setattro */
1219
    0,                                  /* tp_as_buffer */
1220
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1221
    0,                                  /* tp_doc */
1222
    0,                                  /* tp_traverse */
1223
    0,                                  /* tp_clear */
1224
    0,                                  /* tp_richcompare */
1225
    0,                                  /* tp_weaklistoffset */
1226
    PyObject_SelfIter,                  /* tp_iter */
1227
    fieldnameiter_next,                 /* tp_iternext */
1228
    fieldnameiter_methods,              /* tp_methods */
1229
    0};
1230
1231
/* unicode_formatter_field_name_split is used to implement
1232
   string.Formatter.vformat.  it takes a PEP 3101 "field name", and
1233
   returns a tuple of (first, rest): "first", the part before the
1234
   first '.' or '['; and "rest", an iterator for the rest of the field
1235
   name.  it's a wrapper around stringlib/string_format.h's
1236
   field_name_split.  The iterator it returns is a
1237
   FieldNameIterator

   self must be a str, otherwise TypeError is raised.  NULL is returned
   if the type check, the allocation, field_name_split() or building
   "first" fails. */
1238
static PyObject *
1239
formatter_field_name_split(PyObject *Py_UNUSED(module), PyObject *self)
1240
0
{
1241
0
    SubString first;
1242
0
    Py_ssize_t first_idx;
1243
0
    fieldnameiterobject *it;
1244
1245
0
    PyObject *first_obj = NULL;
1246
0
    PyObject *result = NULL;
1247
1248
0
    if (!PyUnicode_Check(self)) {
1249
0
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1250
0
        return NULL;
1251
0
    }
1252
1253
0
    it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
1254
0
    if (it == NULL)
1255
0
        return NULL;
1256
1257
    /* take a new reference to self and store it in the iterator.  this is
1258
       just to keep the field_name alive */
1259
0
    it->str = Py_NewRef(self);
1260
1261
    /* Pass in auto_number = NULL. We'll return an empty string for
1262
       first_obj in that case. */
1263
0
    if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self),
1264
0
                          &first, &first_idx, &it->it_field, NULL))
1265
0
        goto done;
1266
1267
    /* first becomes an integer, if possible; else a string */
1268
0
    if (first_idx != -1)
1269
0
        first_obj = PyLong_FromSsize_t(first_idx);
1270
0
    else
1271
        /* convert "first" into a string object */
1272
0
        first_obj = SubString_new_object(&first);
1273
0
    if (first_obj == NULL)
1274
0
        goto done;
1275
1276
    /* return a tuple of values */
1277
0
    result = PyTuple_Pack(2, first_obj, it);
1278
1279
0
/* shared exit: this function's own references to it and first_obj are
   dropped on every path; result is still NULL if any step above failed */
done:
1280
0
    Py_XDECREF(it);
1281
0
    Py_XDECREF(first_obj);
1282
0
    return result;
1283
0
}