Coverage Report

Created: 2025-07-18 06:09

/src/cpython/Objects/stringlib/unicode_format.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
    unicode_format.h -- implementation of str.format().
3
*/
4
5
#include "pycore_complexobject.h" // _PyComplex_FormatAdvancedWriter()
6
#include "pycore_floatobject.h"   // _PyFloat_FormatAdvancedWriter()
7
8
/************************************************************************/
9
/***********   Global data structures and forward declarations  *********/
10
/************************************************************************/
11
12
/*
13
   A SubString consists of the characters between two string or
14
   unicode pointers.
15
*/
16
typedef struct {
17
    PyObject *str; /* borrowed reference */
18
    Py_ssize_t start, end;
19
} SubString;
20
21
22
typedef enum {
23
    ANS_INIT,
24
    ANS_AUTO,
25
    ANS_MANUAL
26
} AutoNumberState;   /* Keep track if we're auto-numbering fields */
27
28
/* Keeps track of our auto-numbering state, and which number field we're on */
29
typedef struct {
30
    AutoNumberState an_state;
31
    int an_field_number;
32
} AutoNumber;
33
34
35
/* forward declaration for recursion */
36
static PyObject *
37
build_string(SubString *input, PyObject *args, PyObject *kwargs,
38
             int recursion_depth, AutoNumber *auto_number);
39
40
41
42
/************************************************************************/
43
/**************************  Utility  functions  ************************/
44
/************************************************************************/
45
46
static void
47
AutoNumber_Init(AutoNumber *auto_number)
48
9.37M
{
49
9.37M
    auto_number->an_state = ANS_INIT;
50
9.37M
    auto_number->an_field_number = 0;
51
9.37M
}
52
53
/* fill in a SubString from a pointer and length */
54
Py_LOCAL_INLINE(void)
55
SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end)
56
178M
{
57
178M
    str->str = s;
58
178M
    str->start = start;
59
178M
    str->end = end;
60
178M
}
61
62
/* return a new string.  if str->str is NULL, return None */
63
Py_LOCAL_INLINE(PyObject *)
64
SubString_new_object(SubString *str)
65
42
{
66
42
    if (str->str == NULL)
67
0
        Py_RETURN_NONE;
68
42
    return PyUnicode_Substring(str->str, str->start, str->end);
69
42
}
70
71
/* return a new string.  if str->str is NULL, return a new empty string */
72
Py_LOCAL_INLINE(PyObject *)
73
SubString_new_object_or_empty(SubString *str)
74
0
{
75
0
    if (str->str == NULL) {
76
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
77
0
    }
78
0
    return SubString_new_object(str);
79
0
}
80
81
/* Return 1 if an error has been detected switching between automatic
82
   field numbering and manual field specification, else return 0. Set
83
   ValueError on error. */
84
static int
85
autonumber_state_error(AutoNumberState state, int field_name_is_empty)
86
17.5M
{
87
17.5M
    if (state == ANS_MANUAL) {
88
64
        if (field_name_is_empty) {
89
0
            PyErr_SetString(PyExc_ValueError, "cannot switch from "
90
0
                            "manual field specification to "
91
0
                            "automatic field numbering");
92
0
            return 1;
93
0
        }
94
64
    }
95
17.5M
    else {
96
17.5M
        if (!field_name_is_empty) {
97
0
            PyErr_SetString(PyExc_ValueError, "cannot switch from "
98
0
                            "automatic field numbering to "
99
0
                            "manual field specification");
100
0
            return 1;
101
0
        }
102
17.5M
    }
103
17.5M
    return 0;
104
17.5M
}
105
106
107
/************************************************************************/
108
/***********  Format string parsing -- integers and identifiers *********/
109
/************************************************************************/
110
111
static Py_ssize_t
112
get_integer(const SubString *str)
113
17.5M
{
114
17.5M
    Py_ssize_t accumulator = 0;
115
17.5M
    Py_ssize_t digitval;
116
17.5M
    Py_ssize_t i;
117
118
    /* empty string is an error */
119
17.5M
    if (str->start >= str->end)
120
17.5M
        return -1;
121
122
170
    for (i = str->start; i < str->end; i++) {
123
106
        digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i));
124
106
        if (digitval < 0)
125
42
            return -1;
126
        /*
127
           Detect possible overflow before it happens:
128
129
              accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if
130
              accumulator > (PY_SSIZE_T_MAX - digitval) / 10.
131
        */
132
64
        if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
133
0
            PyErr_Format(PyExc_ValueError,
134
0
                         "Too many decimal digits in format string");
135
0
            return -1;
136
0
        }
137
64
        accumulator = accumulator * 10 + digitval;
138
64
    }
139
64
    return accumulator;
140
106
}
141
142
/************************************************************************/
143
/******** Functions to get field objects and specification strings ******/
144
/************************************************************************/
145
146
/* do the equivalent of obj.name */
147
static PyObject *
148
getattr(PyObject *obj, SubString *name)
149
0
{
150
0
    PyObject *newobj;
151
0
    PyObject *str = SubString_new_object(name);
152
0
    if (str == NULL)
153
0
        return NULL;
154
0
    newobj = PyObject_GetAttr(obj, str);
155
0
    Py_DECREF(str);
156
0
    return newobj;
157
0
}
158
159
/* do the equivalent of obj[idx], where obj is a sequence */
160
static PyObject *
161
getitem_sequence(PyObject *obj, Py_ssize_t idx)
162
0
{
163
0
    return PySequence_GetItem(obj, idx);
164
0
}
165
166
/* do the equivalent of obj[idx], where obj is not a sequence */
167
static PyObject *
168
getitem_idx(PyObject *obj, Py_ssize_t idx)
169
0
{
170
0
    PyObject *newobj;
171
0
    PyObject *idx_obj = PyLong_FromSsize_t(idx);
172
0
    if (idx_obj == NULL)
173
0
        return NULL;
174
0
    newobj = PyObject_GetItem(obj, idx_obj);
175
0
    Py_DECREF(idx_obj);
176
0
    return newobj;
177
0
}
178
179
/* do the equivalent of obj[name] */
180
static PyObject *
181
getitem_str(PyObject *obj, SubString *name)
182
0
{
183
0
    PyObject *newobj;
184
0
    PyObject *str = SubString_new_object(name);
185
0
    if (str == NULL)
186
0
        return NULL;
187
0
    newobj = PyObject_GetItem(obj, str);
188
0
    Py_DECREF(str);
189
0
    return newobj;
190
0
}
191
192
typedef struct {
193
    /* the entire string we're parsing.  we assume that someone else
194
       is managing its lifetime, and that it will exist for the
195
       lifetime of the iterator.  can be empty */
196
    SubString str;
197
198
    /* index to where we are inside field_name */
199
    Py_ssize_t index;
200
} FieldNameIterator;
201
202
203
static int
204
FieldNameIterator_init(FieldNameIterator *self, PyObject *s,
205
                       Py_ssize_t start, Py_ssize_t end)
206
17.5M
{
207
17.5M
    SubString_init(&self->str, s, start, end);
208
17.5M
    self->index = start;
209
17.5M
    return 1;
210
17.5M
}
211
212
static int
213
_FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
214
0
{
215
0
    Py_UCS4 c;
216
217
0
    name->str = self->str.str;
218
0
    name->start = self->index;
219
220
    /* return everything until '.' or '[' */
221
0
    while (self->index < self->str.end) {
222
0
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
223
0
        switch (c) {
224
0
        case '[':
225
0
        case '.':
226
            /* backup so that we this character will be seen next time */
227
0
            self->index--;
228
0
            break;
229
0
        default:
230
0
            continue;
231
0
        }
232
0
        break;
233
0
    }
234
    /* end of string is okay */
235
0
    name->end = self->index;
236
0
    return 1;
237
0
}
238
239
static int
240
_FieldNameIterator_item(FieldNameIterator *self, SubString *name)
241
0
{
242
0
    int bracket_seen = 0;
243
0
    Py_UCS4 c;
244
245
0
    name->str = self->str.str;
246
0
    name->start = self->index;
247
248
    /* return everything until ']' */
249
0
    while (self->index < self->str.end) {
250
0
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
251
0
        switch (c) {
252
0
        case ']':
253
0
            bracket_seen = 1;
254
0
            break;
255
0
        default:
256
0
            continue;
257
0
        }
258
0
        break;
259
0
    }
260
    /* make sure we ended with a ']' */
261
0
    if (!bracket_seen) {
262
0
        PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
263
0
        return 0;
264
0
    }
265
266
    /* end of string is okay */
267
    /* don't include the ']' */
268
0
    name->end = self->index-1;
269
0
    return 1;
270
0
}
271
272
/* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */
273
static int
274
FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
275
                       Py_ssize_t *name_idx, SubString *name)
276
17.5M
{
277
    /* check at end of input */
278
17.5M
    if (self->index >= self->str.end)
279
17.5M
        return 1;
280
281
0
    switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) {
282
0
    case '.':
283
0
        *is_attribute = 1;
284
0
        if (_FieldNameIterator_attr(self, name) == 0)
285
0
            return 0;
286
0
        *name_idx = -1;
287
0
        break;
288
0
    case '[':
289
0
        *is_attribute = 0;
290
0
        if (_FieldNameIterator_item(self, name) == 0)
291
0
            return 0;
292
0
        *name_idx = get_integer(name);
293
0
        if (*name_idx == -1 && PyErr_Occurred())
294
0
            return 0;
295
0
        break;
296
0
    default:
297
        /* Invalid character follows ']' */
298
0
        PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
299
0
                        "follow ']' in format field specifier");
300
0
        return 0;
301
0
    }
302
303
    /* empty string is an error */
304
0
    if (name->start == name->end) {
305
0
        PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
306
0
        return 0;
307
0
    }
308
309
0
    return 2;
310
0
}
311
312
313
/* input: field_name
314
   output: 'first' points to the part before the first '[' or '.'
315
           'first_idx' is -1 if 'first' is not an integer, otherwise
316
                       it's the value of first converted to an integer
317
           'rest' is an iterator to return the rest
318
*/
319
static int
320
field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first,
321
                 Py_ssize_t *first_idx, FieldNameIterator *rest,
322
                 AutoNumber *auto_number)
323
17.5M
{
324
17.5M
    Py_UCS4 c;
325
17.5M
    Py_ssize_t i = start;
326
17.5M
    int field_name_is_empty;
327
17.5M
    int using_numeric_index;
328
329
    /* find the part up until the first '.' or '[' */
330
17.5M
    while (i < end) {
331
428
        switch (c = PyUnicode_READ_CHAR(str, i++)) {
332
0
        case '[':
333
0
        case '.':
334
            /* backup so that we this character is available to the
335
               "rest" iterator */
336
0
            i--;
337
0
            break;
338
428
        default:
339
428
            continue;
340
428
        }
341
0
        break;
342
428
    }
343
344
    /* set up the return values */
345
17.5M
    SubString_init(first, str, start, i);
346
17.5M
    FieldNameIterator_init(rest, str, i, end);
347
348
    /* see if "first" is an integer, in which case it's used as an index */
349
17.5M
    *first_idx = get_integer(first);
350
17.5M
    if (*first_idx == -1 && PyErr_Occurred())
351
0
        return 0;
352
353
17.5M
    field_name_is_empty = first->start >= first->end;
354
355
    /* If the field name is omitted or if we have a numeric index
356
       specified, then we're doing numeric indexing into args. */
357
17.5M
    using_numeric_index = field_name_is_empty || *first_idx != -1;
358
359
    /* We always get here exactly one time for each field we're
360
       processing. And we get here in field order (counting by left
361
       braces). So this is the perfect place to handle automatic field
362
       numbering if the field name is omitted. */
363
364
    /* Check if we need to do the auto-numbering. It's not needed if
365
       we're called from string.Format routines, because it's handled
366
       in that class by itself. */
367
17.5M
    if (auto_number) {
368
        /* Initialize our auto numbering state if this is the first
369
           time we're either auto-numbering or manually numbering. */
370
17.5M
        if (auto_number->an_state == ANS_INIT && using_numeric_index)
371
9.37M
            auto_number->an_state = field_name_is_empty ?
372
9.37M
                ANS_AUTO : ANS_MANUAL;
373
374
        /* Make sure our state is consistent with what we're doing
375
           this time through. Only check if we're using a numeric
376
           index. */
377
17.5M
        if (using_numeric_index)
378
17.5M
            if (autonumber_state_error(auto_number->an_state,
379
17.5M
                                       field_name_is_empty))
380
0
                return 0;
381
        /* Zero length field means we want to do auto-numbering of the
382
           fields. */
383
17.5M
        if (field_name_is_empty)
384
17.5M
            *first_idx = (auto_number->an_field_number)++;
385
17.5M
    }
386
387
17.5M
    return 1;
388
17.5M
}
389
390
391
/*
392
    get_field_object returns the object inside {}, before the
393
    format_spec.  It handles getindex and getattr lookups and consumes
394
    the entire input string.
395
*/
396
static PyObject *
397
get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
398
                 AutoNumber *auto_number)
399
17.5M
{
400
17.5M
    PyObject *obj = NULL;
401
17.5M
    int ok;
402
17.5M
    int is_attribute;
403
17.5M
    SubString name;
404
17.5M
    SubString first;
405
17.5M
    Py_ssize_t index;
406
17.5M
    FieldNameIterator rest;
407
408
17.5M
    if (!field_name_split(input->str, input->start, input->end, &first,
409
17.5M
                          &index, &rest, auto_number)) {
410
0
        goto error;
411
0
    }
412
413
17.5M
    if (index == -1) {
414
        /* look up in kwargs */
415
42
        PyObject *key = SubString_new_object(&first);
416
42
        if (key == NULL) {
417
0
            goto error;
418
0
        }
419
42
        if (kwargs == NULL) {
420
0
            PyErr_SetObject(PyExc_KeyError, key);
421
0
            Py_DECREF(key);
422
0
            goto error;
423
0
        }
424
        /* Use PyObject_GetItem instead of PyDict_GetItem because this
425
           code is no longer just used with kwargs. It might be passed
426
           a non-dict when called through format_map. */
427
42
        obj = PyObject_GetItem(kwargs, key);
428
42
        Py_DECREF(key);
429
42
        if (obj == NULL) {
430
0
            goto error;
431
0
        }
432
42
    }
433
17.5M
    else {
434
        /* If args is NULL, we have a format string with a positional field
435
           with only kwargs to retrieve it from. This can only happen when
436
           used with format_map(), where positional arguments are not
437
           allowed. */
438
17.5M
        if (args == NULL) {
439
0
            PyErr_SetString(PyExc_ValueError, "Format string contains "
440
0
                            "positional fields");
441
0
            goto error;
442
0
        }
443
444
        /* look up in args */
445
17.5M
        obj = PySequence_GetItem(args, index);
446
17.5M
        if (obj == NULL) {
447
0
            PyErr_Format(PyExc_IndexError,
448
0
                         "Replacement index %zd out of range for positional "
449
0
                         "args tuple",
450
0
                         index);
451
0
             goto error;
452
0
        }
453
17.5M
    }
454
455
    /* iterate over the rest of the field_name */
456
17.5M
    while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
457
17.5M
                                        &name)) == 2) {
458
0
        PyObject *tmp;
459
460
0
        if (is_attribute)
461
            /* getattr lookup "." */
462
0
            tmp = getattr(obj, &name);
463
0
        else
464
            /* getitem lookup "[]" */
465
0
            if (index == -1)
466
0
                tmp = getitem_str(obj, &name);
467
0
            else
468
0
                if (PySequence_Check(obj))
469
0
                    tmp = getitem_sequence(obj, index);
470
0
                else
471
                    /* not a sequence */
472
0
                    tmp = getitem_idx(obj, index);
473
0
        if (tmp == NULL)
474
0
            goto error;
475
476
        /* assign to obj */
477
0
        Py_SETREF(obj, tmp);
478
0
    }
479
    /* end of iterator, this is the non-error case */
480
17.5M
    if (ok == 1)
481
17.5M
        return obj;
482
0
error:
483
0
    Py_XDECREF(obj);
484
0
    return NULL;
485
17.5M
}
486
487
/************************************************************************/
488
/*****************  Field rendering functions  **************************/
489
/************************************************************************/
490
491
/*
492
    render_field() is the main function in this section.  It takes the
493
    field object and field specification string generated by
494
    get_field_and_spec, and renders the field into the output string.
495
496
    render_field calls fieldobj.__format__(format_spec) method, and
497
    appends to the output.
498
*/
499
static int
500
render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer)
501
17.5M
{
502
17.5M
    int ok = 0;
503
17.5M
    PyObject *result = NULL;
504
17.5M
    PyObject *format_spec_object = NULL;
505
17.5M
    int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
506
17.5M
    int err;
507
508
    /* If we know the type exactly, skip the lookup of __format__ and just
509
       call the formatter directly. */
510
17.5M
    if (PyUnicode_CheckExact(fieldobj))
511
17.2M
        formatter = _PyUnicode_FormatAdvancedWriter;
512
313k
    else if (PyLong_CheckExact(fieldobj))
513
72.8k
        formatter = _PyLong_FormatAdvancedWriter;
514
241k
    else if (PyFloat_CheckExact(fieldobj))
515
0
        formatter = _PyFloat_FormatAdvancedWriter;
516
241k
    else if (PyComplex_CheckExact(fieldobj))
517
0
        formatter = _PyComplex_FormatAdvancedWriter;
518
519
17.5M
    if (formatter) {
520
        /* we know exactly which formatter will be called when __format__ is
521
           looked up, so call it directly, instead. */
522
17.3M
        err = formatter(writer, fieldobj, format_spec->str,
523
17.3M
                        format_spec->start, format_spec->end);
524
17.3M
        return (err == 0);
525
17.3M
    }
526
241k
    else {
527
        /* We need to create an object out of the pointers we have, because
528
           __format__ takes a string/unicode object for format_spec. */
529
241k
        if (format_spec->str)
530
0
            format_spec_object = PyUnicode_Substring(format_spec->str,
531
0
                                                     format_spec->start,
532
0
                                                     format_spec->end);
533
241k
        else
534
241k
            format_spec_object = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
535
241k
        if (format_spec_object == NULL)
536
0
            goto done;
537
538
241k
        result = PyObject_Format(fieldobj, format_spec_object);
539
241k
    }
540
241k
    if (result == NULL)
541
1
        goto done;
542
543
241k
    if (_PyUnicodeWriter_WriteStr(writer, result) == -1)
544
0
        goto done;
545
241k
    ok = 1;
546
547
241k
done:
548
241k
    Py_XDECREF(format_spec_object);
549
241k
    Py_XDECREF(result);
550
241k
    return ok;
551
241k
}
552
553
static int
554
parse_field(SubString *str, SubString *field_name, SubString *format_spec,
555
            int *format_spec_needs_expanding, Py_UCS4 *conversion)
556
17.5M
{
557
    /* Note this function works if the field name is zero length,
558
       which is good.  Zero length field names are handled later, in
559
       field_name_split. */
560
561
17.5M
    Py_UCS4 c = 0;
562
563
    /* initialize these, as they may be empty */
564
17.5M
    *conversion = '\0';
565
17.5M
    SubString_init(format_spec, NULL, 0, 0);
566
567
    /* Search for the field name.  it's terminated by the end of
568
       the string, or a ':' or '!' */
569
17.5M
    field_name->str = str->str;
570
17.5M
    field_name->start = str->start;
571
17.5M
    while (str->start < str->end) {
572
17.5M
        switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
573
0
        case '{':
574
0
            PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
575
0
            return 0;
576
0
        case '[':
577
0
            for (; str->start < str->end; str->start++)
578
0
                if (PyUnicode_READ_CHAR(str->str, str->start) == ']')
579
0
                    break;
580
0
            continue;
581
17.0M
        case '}':
582
17.0M
        case ':':
583
17.5M
        case '!':
584
17.5M
            break;
585
428
        default:
586
428
            continue;
587
17.5M
        }
588
17.5M
        break;
589
17.5M
    }
590
591
17.5M
    field_name->end = str->start - 1;
592
17.5M
    if (c == '!' || c == ':') {
593
574k
        Py_ssize_t count;
594
        /* we have a format specifier and/or a conversion */
595
        /* don't include the last character */
596
597
        /* see if there's a conversion specifier */
598
574k
        if (c == '!') {
599
            /* there must be another character present */
600
574k
            if (str->start >= str->end) {
601
0
                PyErr_SetString(PyExc_ValueError,
602
0
                                "end of string while looking for conversion "
603
0
                                "specifier");
604
0
                return 0;
605
0
            }
606
574k
            *conversion = PyUnicode_READ_CHAR(str->str, str->start++);
607
608
574k
            if (str->start < str->end) {
609
574k
                c = PyUnicode_READ_CHAR(str->str, str->start++);
610
574k
                if (c == '}')
611
574k
                    return 1;
612
0
                if (c != ':') {
613
0
                    PyErr_SetString(PyExc_ValueError,
614
0
                                    "expected ':' after conversion specifier");
615
0
                    return 0;
616
0
                }
617
0
            }
618
574k
        }
619
64
        format_spec->str = str->str;
620
64
        format_spec->start = str->start;
621
64
        count = 1;
622
256
        while (str->start < str->end) {
623
256
            switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
624
0
            case '{':
625
0
                *format_spec_needs_expanding = 1;
626
0
                count++;
627
0
                break;
628
64
            case '}':
629
64
                count--;
630
64
                if (count == 0) {
631
64
                    format_spec->end = str->start - 1;
632
64
                    return 1;
633
64
                }
634
0
                break;
635
192
            default:
636
192
                break;
637
256
            }
638
256
        }
639
640
0
        PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
641
0
        return 0;
642
64
    }
643
17.0M
    else if (c != '}') {
644
0
        PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
645
0
        return 0;
646
0
    }
647
648
17.0M
    return 1;
649
17.5M
}
650
651
/************************************************************************/
652
/******* Output string allocation and escape-to-markup processing  ******/
653
/************************************************************************/
654
655
/* MarkupIterator breaks the string into pieces of either literal
656
   text, or things inside {} that need to be marked up.  it is
657
   designed to make it easy to wrap a Python iterator around it, for
658
   use with the Formatter class */
659
660
typedef struct {
661
    SubString str;
662
} MarkupIterator;
663
664
static int
665
MarkupIterator_init(MarkupIterator *self, PyObject *str,
666
                    Py_ssize_t start, Py_ssize_t end)
667
9.37M
{
668
9.37M
    SubString_init(&self->str, str, start, end);
669
9.37M
    return 1;
670
9.37M
}
671
672
/* returns 0 on error, 1 on non-error termination, and 2 if it got a
673
   string (or something to be expanded) */
674
static int
675
MarkupIterator_next(MarkupIterator *self, SubString *literal,
676
                    int *field_present, SubString *field_name,
677
                    SubString *format_spec, Py_UCS4 *conversion,
678
                    int *format_spec_needs_expanding)
679
35.5M
{
680
35.5M
    int at_end;
681
35.5M
    Py_UCS4 c = 0;
682
35.5M
    Py_ssize_t start;
683
35.5M
    Py_ssize_t len;
684
35.5M
    int markup_follows = 0;
685
686
    /* initialize all of the output variables */
687
35.5M
    SubString_init(literal, NULL, 0, 0);
688
35.5M
    SubString_init(field_name, NULL, 0, 0);
689
35.5M
    SubString_init(format_spec, NULL, 0, 0);
690
35.5M
    *conversion = '\0';
691
35.5M
    *format_spec_needs_expanding = 0;
692
35.5M
    *field_present = 0;
693
694
    /* No more input, end of iterator.  This is the normal exit
695
       path. */
696
35.5M
    if (self->str.start >= self->str.end)
697
9.37M
        return 1;
698
699
26.1M
    start = self->str.start;
700
701
    /* First read any literal text. Read until the end of string, an
702
       escaped '{' or '}', or an unescaped '{'.  In order to never
703
       allocate memory and so I can just pass pointers around, if
704
       there's an escaped '{' or '}' then we'll return the literal
705
       including the brace, but no format object.  The next time
706
       through, we'll return the rest of the literal, skipping past
707
       the second consecutive brace. */
708
81.2M
    while (self->str.start < self->str.end) {
709
72.6M
        switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
710
17.5M
        case '{':
711
17.5M
        case '}':
712
17.5M
            markup_follows = 1;
713
17.5M
            break;
714
55.0M
        default:
715
55.0M
            continue;
716
72.6M
        }
717
17.5M
        break;
718
72.6M
    }
719
720
26.1M
    at_end = self->str.start >= self->str.end;
721
26.1M
    len = self->str.start - start;
722
723
26.1M
    if ((c == '}') && (at_end ||
724
0
                       (c != PyUnicode_READ_CHAR(self->str.str,
725
0
                                                 self->str.start)))) {
726
0
        PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
727
0
                        "in format string");
728
0
        return 0;
729
0
    }
730
26.1M
    if (at_end && c == '{') {
731
0
        PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
732
0
                        "in format string");
733
0
        return 0;
734
0
    }
735
26.1M
    if (!at_end) {
736
17.5M
        if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) {
737
            /* escaped } or {, skip it in the input.  there is no
738
               markup object following us, just this literal text */
739
0
            self->str.start++;
740
0
            markup_follows = 0;
741
0
        }
742
17.5M
        else
743
17.5M
            len--;
744
17.5M
    }
745
746
    /* record the literal text */
747
26.1M
    literal->str = self->str.str;
748
26.1M
    literal->start = start;
749
26.1M
    literal->end = start + len;
750
751
26.1M
    if (!markup_follows)
752
8.53M
        return 2;
753
754
    /* this is markup; parse the field */
755
17.5M
    *field_present = 1;
756
17.5M
    if (!parse_field(&self->str, field_name, format_spec,
757
17.5M
                     format_spec_needs_expanding, conversion))
758
0
        return 0;
759
17.5M
    return 2;
760
17.5M
}
761
762
763
/* do the !r or !s conversion on obj */
764
static PyObject *
765
do_conversion(PyObject *obj, Py_UCS4 conversion)
766
574k
{
767
    /* XXX in pre-3.0, do we need to convert this to unicode, since it
768
       might have returned a string? */
769
574k
    switch (conversion) {
770
574k
    case 'r':
771
574k
        return PyObject_Repr(obj);
772
0
    case 's':
773
0
        return PyObject_Str(obj);
774
0
    case 'a':
775
0
        return PyObject_ASCII(obj);
776
0
    default:
777
0
        if (conversion > 32 && conversion < 127) {
778
                /* It's the ASCII subrange; casting to char is safe
779
                   (assuming the execution character set is an ASCII
780
                   superset). */
781
0
                PyErr_Format(PyExc_ValueError,
782
0
                     "Unknown conversion specifier %c",
783
0
                     (char)conversion);
784
0
        } else
785
0
                PyErr_Format(PyExc_ValueError,
786
0
                     "Unknown conversion specifier \\x%x",
787
0
                     (unsigned int)conversion);
788
0
        return NULL;
789
574k
    }
790
574k
}
791
792
/* given:
793
794
   {field_name!conversion:format_spec}
795
796
   compute the result and write it to output.
797
   format_spec_needs_expanding is an optimization.  if it's false,
798
   just output the string directly, otherwise recursively expand the
799
   format_spec string.
800
801
   field_name is allowed to be zero length, in which case we
802
   are doing auto field numbering.
803
*/
804
805
static int
806
output_markup(SubString *field_name, SubString *format_spec,
807
              int format_spec_needs_expanding, Py_UCS4 conversion,
808
              _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs,
809
              int recursion_depth, AutoNumber *auto_number)
810
17.5M
{
811
17.5M
    PyObject *tmp = NULL;
812
17.5M
    PyObject *fieldobj = NULL;
813
17.5M
    SubString expanded_format_spec;
814
17.5M
    SubString *actual_format_spec;
815
17.5M
    int result = 0;
816
817
    /* convert field_name to an object */
818
17.5M
    fieldobj = get_field_object(field_name, args, kwargs, auto_number);
819
17.5M
    if (fieldobj == NULL)
820
0
        goto done;
821
822
17.5M
    if (conversion != '\0') {
823
574k
        tmp = do_conversion(fieldobj, conversion);
824
574k
        if (tmp == NULL)
825
0
            goto done;
826
827
        /* do the assignment, transferring ownership: fieldobj = tmp */
828
574k
        Py_SETREF(fieldobj, tmp);
829
574k
        tmp = NULL;
830
574k
    }
831
832
    /* if needed, recursively compute the format_spec */
833
17.5M
    if (format_spec_needs_expanding) {
834
0
        tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
835
0
                           auto_number);
836
0
        if (tmp == NULL)
837
0
            goto done;
838
839
        /* note that in the case we're expanding the format string,
840
           tmp must be kept around until after the call to
841
           render_field. */
842
0
        SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp));
843
0
        actual_format_spec = &expanded_format_spec;
844
0
    }
845
17.5M
    else
846
17.5M
        actual_format_spec = format_spec;
847
848
17.5M
    if (render_field(fieldobj, actual_format_spec, writer) == 0)
849
1
        goto done;
850
851
17.5M
    result = 1;
852
853
17.5M
done:
854
17.5M
    Py_XDECREF(fieldobj);
855
17.5M
    Py_XDECREF(tmp);
856
857
17.5M
    return result;
858
17.5M
}
859
860
/*
861
    do_markup is the top-level loop for the format() method.  It
862
    searches through the format string for escapes to markup codes, and
863
    calls other functions to move non-markup text to the output,
864
    and to perform the markup to the output.
865
*/
866
static int
867
do_markup(SubString *input, PyObject *args, PyObject *kwargs,
868
          _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number)
869
9.37M
{
870
9.37M
    MarkupIterator iter;
871
9.37M
    int format_spec_needs_expanding;
872
9.37M
    int result;
873
9.37M
    int field_present;
874
9.37M
    SubString literal;
875
9.37M
    SubString field_name;
876
9.37M
    SubString format_spec;
877
9.37M
    Py_UCS4 conversion;
878
879
9.37M
    MarkupIterator_init(&iter, input->str, input->start, input->end);
880
35.5M
    while ((result = MarkupIterator_next(&iter, &literal, &field_present,
881
35.5M
                                         &field_name, &format_spec,
882
35.5M
                                         &conversion,
883
35.5M
                                         &format_spec_needs_expanding)) == 2) {
884
26.1M
        if (literal.end != literal.start) {
885
17.9M
            if (!field_present && iter.str.start == iter.str.end)
886
8.53M
                writer->overallocate = 0;
887
17.9M
            if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
888
17.9M
                                                literal.start, literal.end) < 0)
889
0
                return 0;
890
17.9M
        }
891
892
26.1M
        if (field_present) {
893
17.5M
            if (iter.str.start == iter.str.end)
894
848k
                writer->overallocate = 0;
895
17.5M
            if (!output_markup(&field_name, &format_spec,
896
17.5M
                               format_spec_needs_expanding, conversion, writer,
897
17.5M
                               args, kwargs, recursion_depth, auto_number))
898
1
                return 0;
899
17.5M
        }
900
26.1M
    }
901
9.37M
    return result;
902
9.37M
}
903
904
905
/*
906
    build_string allocates the output string and then
907
    calls do_markup to do the heavy lifting.
908
*/
909
static PyObject *
910
build_string(SubString *input, PyObject *args, PyObject *kwargs,
911
             int recursion_depth, AutoNumber *auto_number)
912
9.37M
{
913
9.37M
    _PyUnicodeWriter writer;
914
915
    /* check the recursion level */
916
9.37M
    if (recursion_depth <= 0) {
917
0
        PyErr_SetString(PyExc_ValueError,
918
0
                        "Max string recursion exceeded");
919
0
        return NULL;
920
0
    }
921
922
9.37M
    _PyUnicodeWriter_Init(&writer);
923
9.37M
    writer.overallocate = 1;
924
9.37M
    writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
925
926
9.37M
    if (!do_markup(input, args, kwargs, &writer, recursion_depth,
927
9.37M
                   auto_number)) {
928
1
        _PyUnicodeWriter_Dealloc(&writer);
929
1
        return NULL;
930
1
    }
931
932
9.37M
    return _PyUnicodeWriter_Finish(&writer);
933
9.37M
}
934
935
/************************************************************************/
936
/*********** main routine ***********************************************/
937
/************************************************************************/
938
939
/* this is the main entry point */
940
static PyObject *
941
do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
942
9.37M
{
943
9.37M
    SubString input;
944
945
    /* PEP 3101 says only 2 levels, so that
946
       "{0:{1}}".format('abc', 's')            # works
947
       "{0:{1:{2}}}".format('abc', 's', '')    # fails
948
    */
949
9.37M
    int recursion_depth = 2;
950
951
9.37M
    AutoNumber auto_number;
952
9.37M
    AutoNumber_Init(&auto_number);
953
9.37M
    SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self));
954
9.37M
    return build_string(&input, args, kwargs, recursion_depth, &auto_number);
955
9.37M
}
956
957
/* Format `self` with `obj` supplied in the kwargs position and no
   positional arguments. */
static PyObject *
do_string_format_map(PyObject *self, PyObject *obj)
{
    PyObject *formatted = do_string_format(self, NULL, obj);

    return formatted;
}
962
963
964
/************************************************************************/
965
/*********** formatteriterator ******************************************/
966
/************************************************************************/
967
968
/* This is used to implement string.Formatter.vparse().  It exists so
969
   Formatter can share code with the built in unicode.format() method.
970
   It's really just a wrapper around MarkupIterator that is callable
971
   from Python. */
972
973
typedef struct {                /* Python object wrapping a MarkupIterator */
974
    PyObject_HEAD
975
    PyObject *str;              /* owned reference; set by formatter_parser, released in formatteriter_dealloc */
976
    MarkupIterator it_markup;   /* parse state; initialized over `str` by formatter_parser */
977
} formatteriterobject;
978
979
static void
980
formatteriter_dealloc(PyObject *op)
981
0
{
982
0
    formatteriterobject *it = (formatteriterobject*)op;
983
0
    Py_XDECREF(it->str);
984
0
    PyObject_Free(it);
985
0
}
986
987
/* returns a tuple:
988
   (literal, field_name, format_spec, conversion)
989
990
   literal is any literal text to output.  might be zero length
991
   field_name is the string before the ':'.  might be None
992
   format_spec is the string after the ':'.  mibht be None
993
   conversion is either None, or the string after the '!'
994
*/
995
static PyObject *
996
formatteriter_next(PyObject *op)
997
0
{
998
0
    formatteriterobject *it = (formatteriterobject*)op;
999
0
    SubString literal;
1000
0
    SubString field_name;
1001
0
    SubString format_spec;
1002
0
    Py_UCS4 conversion;
1003
0
    int format_spec_needs_expanding;
1004
0
    int field_present;
1005
0
    int result = MarkupIterator_next(&it->it_markup, &literal, &field_present,
1006
0
                                     &field_name, &format_spec, &conversion,
1007
0
                                     &format_spec_needs_expanding);
1008
1009
    /* all of the SubString objects point into it->str, so no
1010
       memory management needs to be done on them */
1011
0
    assert(0 <= result && result <= 2);
1012
0
    if (result == 0 || result == 1)
1013
        /* if 0, error has already been set, if 1, iterator is empty */
1014
0
        return NULL;
1015
0
    else {
1016
0
        PyObject *literal_str = NULL;
1017
0
        PyObject *field_name_str = NULL;
1018
0
        PyObject *format_spec_str = NULL;
1019
0
        PyObject *conversion_str = NULL;
1020
0
        PyObject *tuple = NULL;
1021
1022
0
        literal_str = SubString_new_object(&literal);
1023
0
        if (literal_str == NULL)
1024
0
            goto done;
1025
1026
0
        field_name_str = SubString_new_object(&field_name);
1027
0
        if (field_name_str == NULL)
1028
0
            goto done;
1029
1030
        /* if field_name is non-zero length, return a string for
1031
           format_spec (even if zero length), else return None */
1032
0
        format_spec_str = (field_present ?
1033
0
                           SubString_new_object_or_empty :
1034
0
                           SubString_new_object)(&format_spec);
1035
0
        if (format_spec_str == NULL)
1036
0
            goto done;
1037
1038
        /* if the conversion is not specified, return a None,
1039
           otherwise create a one length string with the conversion
1040
           character */
1041
0
        if (conversion == '\0') {
1042
0
            conversion_str = Py_NewRef(Py_None);
1043
0
        }
1044
0
        else
1045
0
            conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
1046
0
                                                       &conversion, 1);
1047
0
        if (conversion_str == NULL)
1048
0
            goto done;
1049
1050
0
        tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str,
1051
0
                             conversion_str);
1052
0
    done:
1053
0
        Py_XDECREF(literal_str);
1054
0
        Py_XDECREF(field_name_str);
1055
0
        Py_XDECREF(format_spec_str);
1056
0
        Py_XDECREF(conversion_str);
1057
0
        return tuple;
1058
0
    }
1059
0
}
1060
1061
static PyMethodDef formatteriter_methods[] = {
1062
    {NULL,              NULL}           /* sentinel */
1063
};
1064
1065
static PyTypeObject PyFormatterIter_Type = {
1066
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1067
    "formatteriterator",                /* tp_name */
1068
    sizeof(formatteriterobject),        /* tp_basicsize */
1069
    0,                                  /* tp_itemsize */
1070
    /* methods */
1071
    formatteriter_dealloc,              /* tp_dealloc */
1072
    0,                                  /* tp_vectorcall_offset */
1073
    0,                                  /* tp_getattr */
1074
    0,                                  /* tp_setattr */
1075
    0,                                  /* tp_as_async */
1076
    0,                                  /* tp_repr */
1077
    0,                                  /* tp_as_number */
1078
    0,                                  /* tp_as_sequence */
1079
    0,                                  /* tp_as_mapping */
1080
    0,                                  /* tp_hash */
1081
    0,                                  /* tp_call */
1082
    0,                                  /* tp_str */
1083
    PyObject_GenericGetAttr,            /* tp_getattro */
1084
    0,                                  /* tp_setattro */
1085
    0,                                  /* tp_as_buffer */
1086
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1087
    0,                                  /* tp_doc */
1088
    0,                                  /* tp_traverse */
1089
    0,                                  /* tp_clear */
1090
    0,                                  /* tp_richcompare */
1091
    0,                                  /* tp_weaklistoffset */
1092
    PyObject_SelfIter,                  /* tp_iter */
1093
    formatteriter_next,                 /* tp_iternext */
1094
    formatteriter_methods,              /* tp_methods */
1095
    0,
1096
};
1097
1098
/* unicode_formatter_parser is used to implement
1099
   string.Formatter.vformat.  it parses a string and returns tuples
1100
   describing the parsed elements.  It's a wrapper around
1101
   stringlib/string_format.h's MarkupIterator */
1102
static PyObject *
1103
formatter_parser(PyObject *Py_UNUSED(module), PyObject *self)
1104
0
{
1105
0
    formatteriterobject *it;
1106
1107
0
    if (!PyUnicode_Check(self)) {
1108
0
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1109
0
        return NULL;
1110
0
    }
1111
1112
0
    it = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
1113
0
    if (it == NULL)
1114
0
        return NULL;
1115
1116
    /* take ownership, give the object to the iterator */
1117
0
    it->str = Py_NewRef(self);
1118
1119
    /* initialize the contained MarkupIterator */
1120
0
    MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self));
1121
0
    return (PyObject *)it;
1122
0
}
1123
1124
1125
/************************************************************************/
1126
/*********** fieldnameiterator ******************************************/
1127
/************************************************************************/
1128
1129
1130
/* This is used to implement string.Formatter.vparse().  It parses the
1131
   field name into attribute and item values.  It's a Python-callable
1132
   wrapper around FieldNameIterator */
1133
1134
typedef struct {                /* Python object wrapping a FieldNameIterator */
1135
    PyObject_HEAD
1136
    PyObject *str;              /* owned reference; set by formatter_field_name_split to keep the field name alive */
1137
    FieldNameIterator it_field; /* iteration state; filled in by field_name_split() */
1138
} fieldnameiterobject;
1139
1140
static void
1141
fieldnameiter_dealloc(PyObject *op)
1142
0
{
1143
0
    fieldnameiterobject *it = (fieldnameiterobject*)op;
1144
0
    Py_XDECREF(it->str);
1145
0
    PyObject_Free(it);
1146
0
}
1147
1148
/* returns a tuple:
1149
   (is_attr, value)
1150
   is_attr is true if we used attribute syntax (e.g., '.foo')
1151
              false if we used index syntax (e.g., '[foo]')
1152
   value is an integer or string
1153
*/
1154
static PyObject *
1155
fieldnameiter_next(PyObject *op)
1156
0
{
1157
0
    fieldnameiterobject *it = (fieldnameiterobject*)op;
1158
0
    int result;
1159
0
    int is_attr;
1160
0
    Py_ssize_t idx;
1161
0
    SubString name;
1162
1163
0
    result = FieldNameIterator_next(&it->it_field, &is_attr,
1164
0
                                    &idx, &name);
1165
0
    if (result == 0 || result == 1)
1166
        /* if 0, error has already been set, if 1, iterator is empty */
1167
0
        return NULL;
1168
0
    else {
1169
0
        PyObject* result = NULL;
1170
0
        PyObject* is_attr_obj = NULL;
1171
0
        PyObject* obj = NULL;
1172
1173
0
        is_attr_obj = PyBool_FromLong(is_attr);
1174
0
        if (is_attr_obj == NULL)
1175
0
            goto done;
1176
1177
        /* either an integer or a string */
1178
0
        if (idx != -1)
1179
0
            obj = PyLong_FromSsize_t(idx);
1180
0
        else
1181
0
            obj = SubString_new_object(&name);
1182
0
        if (obj == NULL)
1183
0
            goto done;
1184
1185
        /* return a tuple of values */
1186
0
        result = PyTuple_Pack(2, is_attr_obj, obj);
1187
1188
0
    done:
1189
0
        Py_XDECREF(is_attr_obj);
1190
0
        Py_XDECREF(obj);
1191
0
        return result;
1192
0
    }
1193
0
}
1194
1195
static PyMethodDef fieldnameiter_methods[] = {
1196
    {NULL,              NULL}           /* sentinel */
1197
};
1198
1199
static PyTypeObject PyFieldNameIter_Type = {
1200
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1201
    "fieldnameiterator",                /* tp_name */
1202
    sizeof(fieldnameiterobject),        /* tp_basicsize */
1203
    0,                                  /* tp_itemsize */
1204
    /* methods */
1205
    fieldnameiter_dealloc,              /* tp_dealloc */
1206
    0,                                  /* tp_vectorcall_offset */
1207
    0,                                  /* tp_getattr */
1208
    0,                                  /* tp_setattr */
1209
    0,                                  /* tp_as_async */
1210
    0,                                  /* tp_repr */
1211
    0,                                  /* tp_as_number */
1212
    0,                                  /* tp_as_sequence */
1213
    0,                                  /* tp_as_mapping */
1214
    0,                                  /* tp_hash */
1215
    0,                                  /* tp_call */
1216
    0,                                  /* tp_str */
1217
    PyObject_GenericGetAttr,            /* tp_getattro */
1218
    0,                                  /* tp_setattro */
1219
    0,                                  /* tp_as_buffer */
1220
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1221
    0,                                  /* tp_doc */
1222
    0,                                  /* tp_traverse */
1223
    0,                                  /* tp_clear */
1224
    0,                                  /* tp_richcompare */
1225
    0,                                  /* tp_weaklistoffset */
1226
    PyObject_SelfIter,                  /* tp_iter */
1227
    fieldnameiter_next,                 /* tp_iternext */
1228
    fieldnameiter_methods,              /* tp_methods */
1229
    0};
1230
1231
/* unicode_formatter_field_name_split is used to implement
1232
   string.Formatter.vformat.  it takes a PEP 3101 "field name", and
1233
   returns a tuple of (first, rest): "first", the part before the
1234
   first '.' or '['; and "rest", an iterator for the rest of the field
1235
   name.  it's a wrapper around stringlib/string_format.h's
1236
   field_name_split.  The iterator it returns is a
1237
   FieldNameIterator */
1238
static PyObject *
1239
formatter_field_name_split(PyObject *Py_UNUSED(module), PyObject *self)
1240
0
{
1241
0
    SubString first;
1242
0
    Py_ssize_t first_idx;
1243
0
    fieldnameiterobject *it;
1244
1245
0
    PyObject *first_obj = NULL;
1246
0
    PyObject *result = NULL;
1247
1248
0
    if (!PyUnicode_Check(self)) {
1249
0
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1250
0
        return NULL;
1251
0
    }
1252
1253
0
    it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
1254
0
    if (it == NULL)
1255
0
        return NULL;
1256
1257
    /* take ownership, give the object to the iterator.  this is
1258
       just to keep the field_name alive */
1259
0
    it->str = Py_NewRef(self);
1260
1261
    /* Pass in auto_number = NULL. We'll return an empty string for
1262
       first_obj in that case. */
1263
0
    if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self),
1264
0
                          &first, &first_idx, &it->it_field, NULL))
1265
0
        goto done;
1266
1267
    /* first becomes an integer, if possible; else a string */
1268
0
    if (first_idx != -1)
1269
0
        first_obj = PyLong_FromSsize_t(first_idx);
1270
0
    else
1271
        /* convert "first" into a string object */
1272
0
        first_obj = SubString_new_object(&first);
1273
0
    if (first_obj == NULL)
1274
0
        goto done;
1275
1276
    /* return a tuple of values */
1277
0
    result = PyTuple_Pack(2, first_obj, it);
1278
1279
0
done:
1280
0
    Py_XDECREF(it);
1281
0
    Py_XDECREF(first_obj);
1282
0
    return result;
1283
0
}