Coverage Report

Created: 2025-07-04 06:49

/src/cpython/Objects/stringlib/unicode_format.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
    unicode_format.h -- implementation of str.format().
3
*/
4
5
#include "pycore_complexobject.h" // _PyComplex_FormatAdvancedWriter()
6
#include "pycore_floatobject.h"   // _PyFloat_FormatAdvancedWriter()
7
8
/************************************************************************/
9
/***********   Global data structures and forward declarations  *********/
10
/************************************************************************/
11
12
/*
13
   A SubString is the run of characters str[start:end] of a unicode
14
   object.  It is described by indices into the object, not by pointers.
15
*/
16
typedef struct {
17
    PyObject *str; /* borrowed reference; NULL marks an unset SubString */
18
    Py_ssize_t start, end; /* half-open index range [start, end) into str */
19
} SubString;
20
21
22
/* Numbering mode of the numerically indexed fields of one format string. */
typedef enum {
23
    ANS_INIT,    /* no empty or numeric field name has been seen yet */
24
    ANS_AUTO,    /* field names are empty and numbered automatically */
25
    ANS_MANUAL   /* field names carry explicit numeric indices */
26
} AutoNumberState;   /* Keep track if we're auto-numbering fields */
27
28
/* Keeps track of our auto-numbering state, and which number field we're on */
29
typedef struct {
30
    AutoNumberState an_state;   /* set once by field_name_split, then checked */
31
    int an_field_number;        /* index handed to the next empty field name */
32
} AutoNumber;
33
34
35
/* forward declaration for recursion */
36
static PyObject *
37
build_string(SubString *input, PyObject *args, PyObject *kwargs,
38
             int recursion_depth, AutoNumber *auto_number);
39
40
41
42
/************************************************************************/
43
/**************************  Utility  functions  ************************/
44
/************************************************************************/
45
46
/* Put auto_number in its start state: numbering mode undecided (ANS_INIT)
   and the next automatic field index at 0. */
static void
47
AutoNumber_Init(AutoNumber *auto_number)
48
8.79M
{
49
8.79M
    auto_number->an_state = ANS_INIT;
50
8.79M
    auto_number->an_field_number = 0;
51
8.79M
}
52
53
/* Fill in a SubString from an object and a start/end index pair.  `s` is
   stored as-is (its reference count is not touched) and may be NULL; the
   indices are copied without any range check. */
54
Py_LOCAL_INLINE(void)
55
SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end)
56
166M
{
57
166M
    str->str = s;
58
166M
    str->start = start;
59
166M
    str->end = end;
60
166M
}
61
62
/* Return a new object for the SubString: None if str->str is NULL,
   otherwise whatever PyUnicode_Substring() returns for [start, end).
   Callers in this file check the result for NULL. */
63
Py_LOCAL_INLINE(PyObject *)
64
SubString_new_object(SubString *str)
65
42
{
66
42
    if (str->str == NULL)
67
0
        Py_RETURN_NONE;
68
42
    return PyUnicode_Substring(str->str, str->start, str->end);
69
42
}
70
71
/* Same as SubString_new_object(), except that an unset SubString
   (str->str == NULL) yields the empty-string constant instead of None. */
72
Py_LOCAL_INLINE(PyObject *)
73
SubString_new_object_or_empty(SubString *str)
74
0
{
75
0
    if (str->str == NULL) {
76
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_STR);
77
0
    }
78
0
    return SubString_new_object(str);
79
0
}
80
81
/* Return 1 if an error has been detected switching between automatic
82
   field numbering and manual field specification, else return 0. Set
83
   ValueError on error.
   `state` is the numbering mode in effect; `field_name_is_empty` is
   nonzero when the field being processed has an empty name. */
84
static int
85
autonumber_state_error(AutoNumberState state, int field_name_is_empty)
86
16.4M
{
87
16.4M
    if (state == ANS_MANUAL) {
88
64
        /* manual mode: an empty field name is the inconsistency */
        if (field_name_is_empty) {
89
0
            PyErr_SetString(PyExc_ValueError, "cannot switch from "
90
0
                            "manual field specification to "
91
0
                            "automatic field numbering");
92
0
            return 1;
93
0
        }
94
64
    }
95
16.4M
    else {
96
16.4M
        /* any other mode: a non-empty field name is the inconsistency */
        if (!field_name_is_empty) {
97
0
            PyErr_SetString(PyExc_ValueError, "cannot switch from "
98
0
                            "automatic field numbering to "
99
0
                            "manual field specification");
100
0
            return 1;
101
0
        }
102
16.4M
    }
103
16.4M
    return 0;
104
16.4M
}
105
106
107
/************************************************************************/
108
/***********  Format string parsing -- integers and identifiers *********/
109
/************************************************************************/
110
111
/* Parse the whole SubString as a non-negative decimal integer.

   Returns the value on success.  Returns -1 in three cases:
     - the substring is empty (no exception set);
     - a character has no decimal value, i.e. Py_UNICODE_TODECIMAL()
       gives a negative result (no exception set);
     - the value would exceed PY_SSIZE_T_MAX (ValueError set).
   Callers tell the last case apart with PyErr_Occurred(). */
static Py_ssize_t
112
get_integer(const SubString *str)
113
16.4M
{
114
16.4M
    Py_ssize_t accumulator = 0;
115
16.4M
    Py_ssize_t digitval;
116
16.4M
    Py_ssize_t i;
117
118
    /* empty string is an error */
119
16.4M
    if (str->start >= str->end)
120
16.4M
        return -1;
121
122
170
    for (i = str->start; i < str->end; i++) {
123
106
        digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i));
124
106
        if (digitval < 0)
125
42
            return -1;
126
        /*
127
           Detect possible overflow before it happens:
128
129
              accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if
130
              accumulator > (PY_SSIZE_T_MAX - digitval) / 10.
131
        */
132
64
        if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
133
0
            PyErr_Format(PyExc_ValueError,
134
0
                         "Too many decimal digits in format string");
135
0
            return -1;
136
0
        }
137
64
        accumulator = accumulator * 10 + digitval;
138
64
    }
139
64
    return accumulator;
140
106
}
141
142
/************************************************************************/
143
/******** Functions to get field objects and specification strings ******/
144
/************************************************************************/
145
146
/* Do the equivalent of obj.name, where the attribute name is the text of
   the SubString `name`.  Returns the result of PyObject_GetAttr(), or NULL
   if the temporary name object could not be created. */
147
static PyObject *
148
getattr(PyObject *obj, SubString *name)
149
0
{
150
0
    PyObject *newobj;
151
0
    PyObject *str = SubString_new_object(name);
152
0
    if (str == NULL)
153
0
        return NULL;
154
0
    newobj = PyObject_GetAttr(obj, str);
155
0
    Py_DECREF(str);
156
0
    return newobj;
157
0
}
158
159
/* Do the equivalent of obj[idx], where obj is a sequence.  Thin wrapper:
   the result of PySequence_GetItem() is returned unchanged. */
160
static PyObject *
161
getitem_sequence(PyObject *obj, Py_ssize_t idx)
162
0
{
163
0
    return PySequence_GetItem(obj, idx);
164
0
}
165
166
/* Do the equivalent of obj[idx], where obj is not a sequence: idx is
   boxed into an int object and looked up with PyObject_GetItem().
   Returns NULL if the int object could not be created. */
167
static PyObject *
168
getitem_idx(PyObject *obj, Py_ssize_t idx)
169
0
{
170
0
    PyObject *newobj;
171
0
    PyObject *idx_obj = PyLong_FromSsize_t(idx);
172
0
    if (idx_obj == NULL)
173
0
        return NULL;
174
0
    newobj = PyObject_GetItem(obj, idx_obj);
175
0
    Py_DECREF(idx_obj);
176
0
    return newobj;
177
0
}
178
179
/* Do the equivalent of obj[name], where the key is the text of the
   SubString `name`.  Returns the result of PyObject_GetItem(), or NULL if
   the temporary key object could not be created. */
180
static PyObject *
181
getitem_str(PyObject *obj, SubString *name)
182
0
{
183
0
    PyObject *newobj;
184
0
    PyObject *str = SubString_new_object(name);
185
0
    if (str == NULL)
186
0
        return NULL;
187
0
    newobj = PyObject_GetItem(obj, str);
188
0
    Py_DECREF(str);
189
0
    return newobj;
190
0
}
191
192
/* Cursor over the ".attr" and "[item]" parts that follow the first
   component of a field name. */
typedef struct {
193
    /* the entire string we're parsing.  we assume that someone else
194
       is managing its lifetime, and that it will exist for the
195
       lifetime of the iterator.  can be empty */
196
    SubString str;
197
198
    /* index of the next unread character; iteration ends at str.end */
199
    Py_ssize_t index;
200
} FieldNameIterator;
201
202
203
/* Point the iterator at s[start:end] and position it at `start`.
   Always returns 1. */
static int
204
FieldNameIterator_init(FieldNameIterator *self, PyObject *s,
205
                       Py_ssize_t start, Py_ssize_t end)
206
16.4M
{
207
16.4M
    SubString_init(&self->str, s, start, end);
208
16.4M
    self->index = start;
209
16.4M
    return 1;
210
16.4M
}
211
212
/* Read an attribute name: starting at self->index, set `name` to the
   characters up to (not including) the next '.' or '[', or up to the end
   of the input.  The iterator is left on that terminator.  Always
   returns 1; an empty name is not rejected here. */
static int
213
_FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
214
0
{
215
0
    Py_UCS4 c;
216
217
0
    name->str = self->str.str;
218
0
    name->start = self->index;
219
220
    /* return everything until '.' or '[' */
221
0
    while (self->index < self->str.end) {
222
0
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
223
0
        switch (c) {
224
0
        case '[':
225
0
        case '.':
226
            /* back up so that this character will be seen next time */
227
0
            self->index--;
228
0
            break;
229
0
        default:
230
0
            continue;
231
0
        }
232
0
        break;
233
0
    }
234
    /* end of string is okay */
235
0
    name->end = self->index;
236
0
    return 1;
237
0
}
238
239
/* Read an item key: starting at self->index (just past the '['), set
   `name` to the characters before the next ']' and leave the iterator
   just past that ']'.  Returns 1 on success; returns 0 with ValueError
   set if the input ends before a ']' is found. */
static int
240
_FieldNameIterator_item(FieldNameIterator *self, SubString *name)
241
0
{
242
0
    int bracket_seen = 0;
243
0
    Py_UCS4 c;
244
245
0
    name->str = self->str.str;
246
0
    name->start = self->index;
247
248
    /* return everything until ']' */
249
0
    while (self->index < self->str.end) {
250
0
        c = PyUnicode_READ_CHAR(self->str.str, self->index++);
251
0
        switch (c) {
252
0
        case ']':
253
0
            bracket_seen = 1;
254
0
            break;
255
0
        default:
256
0
            continue;
257
0
        }
258
0
        break;
259
0
    }
260
    /* make sure we ended with a ']' */
261
0
    if (!bracket_seen) {
262
0
        PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
263
0
        return 0;
264
0
    }
265
266
    /* self->index is now one past the ']' */
267
    /* don't include the ']' */
268
0
    name->end = self->index-1;
269
0
    return 1;
270
0
}
271
272
/* returns 0 on error, 1 on non-error termination, and 2 if it returns a value.
   When 2 is returned:
     *is_attribute is 1 for a ".name" part and 0 for a "[key]" part;
     *name is the text of the name or key;
     *name_idx is -1 for attributes and for keys that get_integer() does
               not accept as an integer, otherwise the integer key.
   An exception is set whenever 0 is returned. */
273
static int
274
FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
275
                       Py_ssize_t *name_idx, SubString *name)
276
16.4M
{
277
    /* check at end of input */
278
16.4M
    if (self->index >= self->str.end)
279
16.4M
        return 1;
280
281
0
    switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) {
282
0
    case '.':
283
0
        *is_attribute = 1;
284
0
        if (_FieldNameIterator_attr(self, name) == 0)
285
0
            return 0;
286
0
        *name_idx = -1;
287
0
        break;
288
0
    case '[':
289
0
        *is_attribute = 0;
290
0
        if (_FieldNameIterator_item(self, name) == 0)
291
0
            return 0;
292
0
        *name_idx = get_integer(name);
293
        /* -1 without an exception just means "not an integer key" */
0
        if (*name_idx == -1 && PyErr_Occurred())
294
0
            return 0;
295
0
        break;
296
0
    default:
297
        /* Invalid character follows ']' */
298
0
        PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
299
0
                        "follow ']' in format field specifier");
300
0
        return 0;
301
0
    }
302
303
    /* empty string is an error */
304
0
    if (name->start == name->end) {
305
0
        PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
306
0
        return 0;
307
0
    }
308
309
0
    return 2;
310
0
}
311
312
313
/* input: field_name, given as str[start:end]
314
   output: 'first' points to the part before the first '[' or '.'
315
           'first_idx' is -1 if 'first' is not an integer, otherwise
316
                       it's the value of first converted to an integer
317
           'rest' is an iterator to return the rest
318
   If 'auto_number' is not NULL and 'first' is empty, 'first_idx' is
   instead set to the next automatic field number.
   Returns 1 on success and 0 with an exception set on error.
*/
319
static int
320
field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first,
321
                 Py_ssize_t *first_idx, FieldNameIterator *rest,
322
                 AutoNumber *auto_number)
323
16.4M
{
324
16.4M
    Py_UCS4 c;
325
16.4M
    Py_ssize_t i = start;
326
16.4M
    int field_name_is_empty;
327
16.4M
    int using_numeric_index;
328
329
    /* find the part up until the first '.' or '[' */
330
16.4M
    while (i < end) {
331
428
        switch (c = PyUnicode_READ_CHAR(str, i++)) {
332
0
        case '[':
333
0
        case '.':
334
            /* back up so that this character is available to the
335
               "rest" iterator */
336
0
            i--;
337
0
            break;
338
428
        default:
339
428
            continue;
340
428
        }
341
0
        break;
342
428
    }
343
344
    /* set up the return values */
345
16.4M
    SubString_init(first, str, start, i);
346
16.4M
    FieldNameIterator_init(rest, str, i, end);
347
348
    /* see if "first" is an integer, in which case it's used as an index */
349
16.4M
    *first_idx = get_integer(first);
350
16.4M
    if (*first_idx == -1 && PyErr_Occurred())
351
0
        return 0;
352
353
16.4M
    field_name_is_empty = first->start >= first->end;
354
355
    /* If the field name is omitted or if we have a numeric index
356
       specified, then we're doing numeric indexing into args. */
357
16.4M
    using_numeric_index = field_name_is_empty || *first_idx != -1;
358
359
    /* We always get here exactly one time for each field we're
360
       processing. And we get here in field order (counting by left
361
       braces). So this is the perfect place to handle automatic field
362
       numbering if the field name is omitted. */
363
364
    /* Check if we need to do the auto-numbering. It's not needed if
365
       we're called from string.Formatter routines, because it's handled
366
       in that class by itself. */
367
16.4M
    if (auto_number) {
368
        /* Initialize our auto numbering state if this is the first
369
           time we're either auto-numbering or manually numbering. */
370
16.4M
        if (auto_number->an_state == ANS_INIT && using_numeric_index)
371
8.79M
            auto_number->an_state = field_name_is_empty ?
372
8.79M
                ANS_AUTO : ANS_MANUAL;
373
374
        /* Make sure our state is consistent with what we're doing
375
           this time through. Only check if we're using a numeric
376
           index. */
377
16.4M
        if (using_numeric_index)
378
16.4M
            if (autonumber_state_error(auto_number->an_state,
379
16.4M
                                       field_name_is_empty))
380
0
                return 0;
381
        /* Zero length field means we want to do auto-numbering of the
382
           fields. */
383
16.4M
        if (field_name_is_empty)
384
16.4M
            *first_idx = (auto_number->an_field_number)++;
385
16.4M
    }
386
387
16.4M
    return 1;
388
16.4M
}
389
390
391
/*
392
    get_field_object returns the object inside {}, before the
393
    format_spec.  It handles getindex and getattr lookups and consumes
394
    the entire input string.

    The first component of the field name is looked up in kwargs when it
    is not numeric and in args otherwise; each following ".attr" or
    "[key]" part is then applied to the result.  Returns a new reference,
    or NULL with an exception set.
395
*/
396
static PyObject *
397
get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
398
                 AutoNumber *auto_number)
399
16.4M
{
400
16.4M
    PyObject *obj = NULL;
401
16.4M
    int ok;
402
16.4M
    int is_attribute;
403
16.4M
    SubString name;
404
16.4M
    SubString first;
405
16.4M
    Py_ssize_t index;
406
16.4M
    FieldNameIterator rest;
407
408
16.4M
    if (!field_name_split(input->str, input->start, input->end, &first,
409
16.4M
                          &index, &rest, auto_number)) {
410
0
        goto error;
411
0
    }
412
413
16.4M
    if (index == -1) {
414
        /* look up in kwargs */
415
42
        PyObject *key = SubString_new_object(&first);
416
42
        if (key == NULL) {
417
0
            goto error;
418
0
        }
419
42
        if (kwargs == NULL) {
420
0
            PyErr_SetObject(PyExc_KeyError, key);
421
0
            Py_DECREF(key);
422
0
            goto error;
423
0
        }
424
        /* Use PyObject_GetItem instead of PyDict_GetItem because this
425
           code is no longer just used with kwargs. It might be passed
426
           a non-dict when called through format_map. */
427
42
        obj = PyObject_GetItem(kwargs, key);
428
42
        Py_DECREF(key);
429
42
        if (obj == NULL) {
430
0
            goto error;
431
0
        }
432
42
    }
433
16.4M
    else {
434
        /* If args is NULL, we have a format string with a positional field
435
           with only kwargs to retrieve it from. This can only happen when
436
           used with format_map(), where positional arguments are not
437
           allowed. */
438
16.4M
        if (args == NULL) {
439
0
            PyErr_SetString(PyExc_ValueError, "Format string contains "
440
0
                            "positional fields");
441
0
            goto error;
442
0
        }
443
444
        /* look up in args */
445
16.4M
        obj = PySequence_GetItem(args, index);
446
16.4M
        if (obj == NULL) {
447
            /* whatever PySequence_GetItem raised is reported as IndexError */
0
            PyErr_Format(PyExc_IndexError,
448
0
                         "Replacement index %zd out of range for positional "
449
0
                         "args tuple",
450
0
                         index);
451
0
             goto error;
452
0
        }
453
16.4M
    }
454
455
    /* iterate over the rest of the field_name */
456
16.4M
    while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
457
16.4M
                                        &name)) == 2) {
458
0
        PyObject *tmp;
459
460
0
        if (is_attribute)
461
            /* getattr lookup "." */
462
0
            tmp = getattr(obj, &name);
463
0
        else
464
            /* getitem lookup "[]" */
465
0
            if (index == -1)
466
0
                tmp = getitem_str(obj, &name);
467
0
            else
468
0
                if (PySequence_Check(obj))
469
0
                    tmp = getitem_sequence(obj, index);
470
0
                else
471
                    /* not a sequence */
472
0
                    tmp = getitem_idx(obj, index);
473
0
        if (tmp == NULL)
474
0
            goto error;
475
476
        /* assign to obj */
477
0
        Py_SETREF(obj, tmp);
478
0
    }
479
    /* end of iterator, this is the non-error case */
480
16.4M
    if (ok == 1)
481
16.4M
        return obj;
482
0
error:
483
0
    Py_XDECREF(obj);
484
0
    return NULL;
485
16.4M
}
486
487
/************************************************************************/
488
/*****************  Field rendering functions  **************************/
489
/************************************************************************/
490
491
/*
492
    render_field() is the main function in this section.  It takes the
493
    field object and the format specification substring, and renders
494
    the field into the output writer.
495
496
    For exact str, int, float and complex objects the matching
    *_FormatAdvancedWriter function is called directly; otherwise
    render_field calls fieldobj.__format__(format_spec) method, and
497
    appends to the output.

    Returns 1 on success and 0 on failure.
498
*/
499
static int
500
render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer)
501
16.4M
{
502
16.4M
    int ok = 0;
503
16.4M
    PyObject *result = NULL;
504
16.4M
    PyObject *format_spec_object = NULL;
505
16.4M
    int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
506
16.4M
    int err;
507
508
    /* If we know the type exactly, skip the lookup of __format__ and just
509
       call the formatter directly. */
510
16.4M
    if (PyUnicode_CheckExact(fieldobj))
511
16.2M
        formatter = _PyUnicode_FormatAdvancedWriter;
512
288k
    else if (PyLong_CheckExact(fieldobj))
513
79.3k
        formatter = _PyLong_FormatAdvancedWriter;
514
208k
    else if (PyFloat_CheckExact(fieldobj))
515
0
        formatter = _PyFloat_FormatAdvancedWriter;
516
208k
    else if (PyComplex_CheckExact(fieldobj))
517
0
        formatter = _PyComplex_FormatAdvancedWriter;
518
519
16.4M
    if (formatter) {
520
        /* we know exactly which formatter will be called when __format__ is
521
           looked up, so call it directly, instead. */
522
16.2M
        err = formatter(writer, fieldobj, format_spec->str,
523
16.2M
                        format_spec->start, format_spec->end);
524
16.2M
        return (err == 0);
525
16.2M
    }
526
208k
    else {
527
        /* We need to create an object out of the pointers we have, because
528
           __format__ takes a string/unicode object for format_spec. */
529
208k
        if (format_spec->str)
530
0
            format_spec_object = PyUnicode_Substring(format_spec->str,
531
0
                                                     format_spec->start,
532
0
                                                     format_spec->end);
533
208k
        else
534
            /* no format spec was given: use the empty string */
208k
            format_spec_object = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
535
208k
        if (format_spec_object == NULL)
536
0
            goto done;
537
538
208k
        result = PyObject_Format(fieldobj, format_spec_object);
539
208k
    }
540
208k
    if (result == NULL)
541
2
        goto done;
542
543
208k
    if (_PyUnicodeWriter_WriteStr(writer, result) == -1)
544
0
        goto done;
545
208k
    ok = 1;
546
547
208k
done:
548
208k
    Py_XDECREF(format_spec_object);
549
208k
    Py_XDECREF(result);
550
208k
    return ok;
551
208k
}
552
553
/* Parse one replacement field.  On entry str->start is just past the
   opening '{'; on success it is just past the matching '}'.

   Outputs:
     field_name   text before the first '}', ':' or '!' found outside
                  "[...]"
     conversion   the character after '!', or '\0' if there is none
     format_spec  text after ':' up to the matching '}'; left unset
                  (str == NULL) when there is no ':'
     *format_spec_needs_expanding
                  set to 1 if the format spec contains a '{'; never
                  cleared here

   Returns 1 on success and 0 with ValueError set on error. */
static int
554
parse_field(SubString *str, SubString *field_name, SubString *format_spec,
555
            int *format_spec_needs_expanding, Py_UCS4 *conversion)
556
16.4M
{
557
    /* Note this function works if the field name is zero length,
558
       which is good.  Zero length field names are handled later, in
559
       field_name_split. */
560
561
16.4M
    Py_UCS4 c = 0;
562
563
    /* initialize these, as they may be empty */
564
16.4M
    *conversion = '\0';
565
16.4M
    SubString_init(format_spec, NULL, 0, 0);
566
567
    /* Search for the field name.  it's terminated by the end of
568
       the string, or a ':' or '!' */
569
16.4M
    field_name->str = str->str;
570
16.4M
    field_name->start = str->start;
571
16.4M
    while (str->start < str->end) {
572
16.4M
        switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
573
0
        case '{':
574
0
            PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
575
0
            return 0;
576
0
        case '[':
577
            /* skip ahead to the ']' so that terminators inside the
               brackets are not acted on */
0
            for (; str->start < str->end; str->start++)
578
0
                if (PyUnicode_READ_CHAR(str->str, str->start) == ']')
579
0
                    break;
580
0
            continue;
581
15.9M
        case '}':
582
15.9M
        case ':':
583
16.4M
        case '!':
584
16.4M
            break;
585
428
        default:
586
428
            continue;
587
16.4M
        }
588
16.4M
        break;
589
16.4M
    }
590
591
    /* the last character read is the terminator, not part of the name */
16.4M
    field_name->end = str->start - 1;
592
16.4M
    if (c == '!' || c == ':') {
593
552k
        Py_ssize_t count;
594
        /* we have a format specifier and/or a conversion */
595
        /* don't include the last character */
596
597
        /* see if there's a conversion specifier */
598
552k
        if (c == '!') {
599
            /* there must be another character present */
600
552k
            if (str->start >= str->end) {
601
0
                PyErr_SetString(PyExc_ValueError,
602
0
                                "end of string while looking for conversion "
603
0
                                "specifier");
604
0
                return 0;
605
0
            }
606
552k
            *conversion = PyUnicode_READ_CHAR(str->str, str->start++);
607
608
552k
            if (str->start < str->end) {
609
552k
                c = PyUnicode_READ_CHAR(str->str, str->start++);
610
552k
                if (c == '}')
611
552k
                    return 1;
612
0
                if (c != ':') {
613
0
                    PyErr_SetString(PyExc_ValueError,
614
0
                                    "expected ':' after conversion specifier");
615
0
                    return 0;
616
0
                }
617
0
            }
618
552k
        }
619
64
        format_spec->str = str->str;
620
64
        format_spec->start = str->start;
621
        /* count is the brace nesting depth; the field's own '{' counts as 1 */
64
        count = 1;
622
256
        while (str->start < str->end) {
623
256
            switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
624
0
            case '{':
625
0
                *format_spec_needs_expanding = 1;
626
0
                count++;
627
0
                break;
628
64
            case '}':
629
64
                count--;
630
64
                if (count == 0) {
631
64
                    format_spec->end = str->start - 1;
632
64
                    return 1;
633
64
                }
634
0
                break;
635
192
            default:
636
192
                break;
637
256
            }
638
256
        }
639
640
0
        PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
641
0
        return 0;
642
64
    }
643
15.9M
    else if (c != '}') {
644
0
        PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
645
0
        return 0;
646
0
    }
647
648
15.9M
    return 1;
649
16.4M
}
650
651
/************************************************************************/
652
/******* Output string allocation and escape-to-markup processing  ******/
653
/************************************************************************/
654
655
/* MarkupIterator breaks the string into pieces of either literal
656
   text, or things inside {} that need to be marked up.  it is
657
   designed to make it easy to wrap a Python iterator around it, for
658
   use with the Formatter class */
659
660
typedef struct {
661
    SubString str;   /* unread remainder; str.start advances as pieces are read */
662
} MarkupIterator;
663
664
/* Point the iterator at str[start:end].  Always returns 1. */
static int
665
MarkupIterator_init(MarkupIterator *self, PyObject *str,
666
                    Py_ssize_t start, Py_ssize_t end)
667
8.79M
{
668
8.79M
    SubString_init(&self->str, str, start, end);
669
8.79M
    return 1;
670
8.79M
}
671
672
/* returns 0 on error, 1 on non-error termination, and 2 if it got a
673
   string (or something to be expanded).
   When 2 is returned, `literal` holds the literal text that was read
   (possibly empty).  If *field_present is 1, a replacement field followed
   it and field_name, format_spec, conversion and
   *format_spec_needs_expanding are filled in by parse_field().
   All outputs are reset at the start of every call. */
674
static int
675
MarkupIterator_next(MarkupIterator *self, SubString *literal,
676
                    int *field_present, SubString *field_name,
677
                    SubString *format_spec, Py_UCS4 *conversion,
678
                    int *format_spec_needs_expanding)
679
33.2M
{
680
33.2M
    int at_end;
681
33.2M
    Py_UCS4 c = 0;
682
33.2M
    Py_ssize_t start;
683
33.2M
    Py_ssize_t len;
684
33.2M
    int markup_follows = 0;
685
686
    /* initialize all of the output variables */
687
33.2M
    SubString_init(literal, NULL, 0, 0);
688
33.2M
    SubString_init(field_name, NULL, 0, 0);
689
33.2M
    SubString_init(format_spec, NULL, 0, 0);
690
33.2M
    *conversion = '\0';
691
33.2M
    *format_spec_needs_expanding = 0;
692
33.2M
    *field_present = 0;
693
694
    /* No more input, end of iterator.  This is the normal exit
695
       path. */
696
33.2M
    if (self->str.start >= self->str.end)
697
8.79M
        return 1;
698
699
24.4M
    start = self->str.start;
700
701
    /* First read any literal text. Read until the end of string, an
702
       escaped '{' or '}', or an unescaped '{'.  In order to never
703
       allocate memory and so I can just pass pointers around, if
704
       there's an escaped '{' or '}' then we'll return the literal
705
       including the brace, but no format object.  The next time
706
       through, we'll return the rest of the literal, skipping past
707
       the second consecutive brace. */
708
76.3M
    while (self->str.start < self->str.end) {
709
68.3M
        switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
710
16.4M
        case '{':
711
16.4M
        case '}':
712
16.4M
            markup_follows = 1;
713
16.4M
            break;
714
51.9M
        default:
715
51.9M
            continue;
716
68.3M
        }
717
16.4M
        break;
718
68.3M
    }
719
720
24.4M
    at_end = self->str.start >= self->str.end;
721
    /* len counts every character read so far, including a brace */
24.4M
    len = self->str.start - start;
722
723
    /* a '}' must be followed by a second '}' */
24.4M
    if ((c == '}') && (at_end ||
724
0
                       (c != PyUnicode_READ_CHAR(self->str.str,
725
0
                                                 self->str.start)))) {
726
0
        PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
727
0
                        "in format string");
728
0
        return 0;
729
0
    }
730
24.4M
    if (at_end && c == '{') {
731
0
        PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
732
0
                        "in format string");
733
0
        return 0;
734
0
    }
735
24.4M
    if (!at_end) {
736
16.4M
        if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) {
737
            /* escaped } or {, skip it in the input.  there is no
738
               markup object following us, just this literal text */
739
0
            self->str.start++;
740
0
            markup_follows = 0;
741
0
        }
742
16.4M
        else
743
            /* unescaped '{': it opens a field and is not literal text */
16.4M
            len--;
744
16.4M
    }
745
746
    /* record the literal text */
747
24.4M
    literal->str = self->str.str;
748
24.4M
    literal->start = start;
749
24.4M
    literal->end = start + len;
750
751
24.4M
    if (!markup_follows)
752
7.97M
        return 2;
753
754
    /* this is markup; parse the field */
755
16.4M
    *field_present = 1;
756
16.4M
    if (!parse_field(&self->str, field_name, format_spec,
757
16.4M
                     format_spec_needs_expanding, conversion))
758
0
        return 0;
759
16.4M
    return 2;
760
16.4M
}
761
762
763
/* Do the !r, !s or !a conversion on obj: 'r' calls PyObject_Repr(),
   's' calls PyObject_Str() and 'a' calls PyObject_ASCII(), and that
   call's result is returned.  Any other conversion character sets
   ValueError and returns NULL. */
764
static PyObject *
765
do_conversion(PyObject *obj, Py_UCS4 conversion)
766
552k
{
767
    /* XXX in pre-3.0, do we need to convert this to unicode, since it
768
       might have returned a string? */
769
552k
    switch (conversion) {
770
552k
    case 'r':
771
552k
        return PyObject_Repr(obj);
772
0
    case 's':
773
0
        return PyObject_Str(obj);
774
0
    case 'a':
775
0
        return PyObject_ASCII(obj);
776
0
    default:
777
0
        if (conversion > 32 && conversion < 127) {
778
                /* It's the ASCII subrange; casting to char is safe
779
                   (assuming the execution character set is an ASCII
780
                   superset). */
781
0
                PyErr_Format(PyExc_ValueError,
782
0
                     "Unknown conversion specifier %c",
783
0
                     (char)conversion);
784
0
        } else
785
                /* anything else is reported by its hex code point */
0
                PyErr_Format(PyExc_ValueError,
786
0
                     "Unknown conversion specifier \\x%x",
787
0
                     (unsigned int)conversion);
788
0
        return NULL;
789
552k
    }
790
552k
}
791
792
/* given:
793
794
   {field_name!conversion:format_spec}
795
796
   compute the result and write it to output.
797
   format_spec_needs_expanding is an optimization.  if it's false,
798
   just output the string directly, otherwise recursively expand the
799
   format_spec string.
800
801
   field_name is allowed to be zero length, in which case we
802
   are doing auto field numbering.

   Returns 1 on success and 0 on failure.
803
*/
804
805
static int
806
output_markup(SubString *field_name, SubString *format_spec,
807
              int format_spec_needs_expanding, Py_UCS4 conversion,
808
              _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs,
809
              int recursion_depth, AutoNumber *auto_number)
810
16.4M
{
811
16.4M
    PyObject *tmp = NULL;
812
16.4M
    PyObject *fieldobj = NULL;
813
16.4M
    SubString expanded_format_spec;
814
16.4M
    SubString *actual_format_spec;
815
16.4M
    int result = 0;
816
817
    /* convert field_name to an object */
818
16.4M
    fieldobj = get_field_object(field_name, args, kwargs, auto_number);
819
16.4M
    if (fieldobj == NULL)
820
0
        goto done;
821
822
16.4M
    if (conversion != '\0') {
823
552k
        tmp = do_conversion(fieldobj, conversion);
824
552k
        if (tmp == NULL)
825
0
            goto done;
826
827
        /* do the assignment, transferring ownership: fieldobj = tmp */
828
552k
        Py_SETREF(fieldobj, tmp);
829
552k
        tmp = NULL;
830
552k
    }
831
832
    /* if needed, recursively compute the format_spec */
833
16.4M
    if (format_spec_needs_expanding) {
834
        /* the nested expansion runs with one less level of recursion */
0
        tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
835
0
                           auto_number);
836
0
        if (tmp == NULL)
837
0
            goto done;
838
839
        /* note that in the case we're expanding the format string,
840
           tmp must be kept around until after the call to
841
           render_field. */
842
0
        SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp));
843
0
        actual_format_spec = &expanded_format_spec;
844
0
    }
845
16.4M
    else
846
16.4M
        actual_format_spec = format_spec;
847
848
16.4M
    if (render_field(fieldobj, actual_format_spec, writer) == 0)
849
2
        goto done;
850
851
16.4M
    result = 1;
852
853
16.4M
done:
854
16.4M
    Py_XDECREF(fieldobj);
855
16.4M
    Py_XDECREF(tmp);
856
857
16.4M
    return result;
858
16.4M
}
859
860
/*
861
    do_markup is the top-level loop for the format() method.  It
862
    searches through the format string for escapes to markup codes, and
863
    calls other functions to move non-markup text to the output,
864
    and to perform the markup to the output.

    Returns 1 when the whole input has been processed and 0 on error.
865
*/
866
static int
867
do_markup(SubString *input, PyObject *args, PyObject *kwargs,
868
          _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number)
869
8.79M
{
870
8.79M
    MarkupIterator iter;
871
8.79M
    int format_spec_needs_expanding;
872
8.79M
    int result;
873
8.79M
    int field_present;
874
8.79M
    SubString literal;
875
8.79M
    SubString field_name;
876
8.79M
    SubString format_spec;
877
8.79M
    Py_UCS4 conversion;
878
879
8.79M
    MarkupIterator_init(&iter, input->str, input->start, input->end);
880
33.2M
    while ((result = MarkupIterator_next(&iter, &literal, &field_present,
881
33.2M
                                         &field_name, &format_spec,
882
33.2M
                                         &conversion,
883
33.2M
                                         &format_spec_needs_expanding)) == 2) {
884
24.4M
        if (literal.end != literal.start) {
885
            /* nothing follows this literal: turn off overallocation */
16.8M
            if (!field_present && iter.str.start == iter.str.end)
886
7.97M
                writer->overallocate = 0;
887
16.8M
            if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
888
16.8M
                                                literal.start, literal.end) < 0)
889
0
                return 0;
890
16.8M
        }
891
892
24.4M
        if (field_present) {
893
            /* this field ends the input: turn off overallocation */
16.4M
            if (iter.str.start == iter.str.end)
894
815k
                writer->overallocate = 0;
895
16.4M
            if (!output_markup(&field_name, &format_spec,
896
16.4M
                               format_spec_needs_expanding, conversion, writer,
897
16.4M
                               args, kwargs, recursion_depth, auto_number))
898
2
                return 0;
899
16.4M
        }
900
24.4M
    }
901
    /* the iterator stopped with 1 (end of input) or 0 (error) */
8.79M
    return result;
902
8.79M
}
903
904
905
/*
906
    build_string allocates the output string and then
907
    calls do_markup to do the heavy lifting.
908
*/
909
static PyObject *
910
build_string(SubString *input, PyObject *args, PyObject *kwargs,
911
             int recursion_depth, AutoNumber *auto_number)
912
8.79M
{
913
8.79M
    _PyUnicodeWriter writer;
914
915
    /* check the recursion level */
916
8.79M
    if (recursion_depth <= 0) {
917
0
        PyErr_SetString(PyExc_ValueError,
918
0
                        "Max string recursion exceeded");
919
0
        return NULL;
920
0
    }
921
922
8.79M
    _PyUnicodeWriter_Init(&writer);
923
8.79M
    writer.overallocate = 1;
924
8.79M
    writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
925
926
8.79M
    if (!do_markup(input, args, kwargs, &writer, recursion_depth,
927
8.79M
                   auto_number)) {
928
2
        _PyUnicodeWriter_Dealloc(&writer);
929
2
        return NULL;
930
2
    }
931
932
8.79M
    return _PyUnicodeWriter_Finish(&writer);
933
8.79M
}
934
935
/************************************************************************/
936
/*********** main routine ***********************************************/
937
/************************************************************************/
938
939
/* this is the main entry point */
940
static PyObject *
do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
{
    /* PEP 3101 says only 2 levels, so that
       "{0:{1}}".format('abc', 's')            # works
       "{0:{1:{2}}}".format('abc', 's', '')    # fails
    */
    const int max_depth = 2;
    SubString whole;
    AutoNumber numbering;

    /* Format the entire string, starting with fresh auto-numbering
       state. */
    SubString_init(&whole, self, 0, PyUnicode_GET_LENGTH(self));
    AutoNumber_Init(&numbering);

    return build_string(&whole, args, kwargs, max_depth, &numbering);
}
956
957
static PyObject *
do_string_format_map(PyObject *self, PyObject *obj)
{
    /* Same as do_string_format(), with no positional arguments and
       `obj` supplied in the kwargs position. */
    PyObject *formatted = do_string_format(self, NULL, obj);

    return formatted;
}
962
963
964
/************************************************************************/
965
/*********** formatteriterator ******************************************/
966
/************************************************************************/
967
968
/* This is used to implement string.Formatter.vparse().  It exists so
969
   Formatter can share code with the built in unicode.format() method.
970
   It's really just a wrapper around MarkupIterator that is callable
971
   from Python. */
972
973
typedef struct {
974
    PyObject_HEAD
975
    PyObject *str;
976
    MarkupIterator it_markup;
977
} formatteriterobject;
978
979
static void
980
formatteriter_dealloc(PyObject *op)
981
0
{
982
0
    formatteriterobject *it = (formatteriterobject*)op;
983
0
    Py_XDECREF(it->str);
984
0
    PyObject_Free(it);
985
0
}
986
987
/* returns a tuple:
988
   (literal, field_name, format_spec, conversion)
989
990
   literal is any literal text to output.  might be zero length
991
   field_name is the string before the ':'.  might be None
992
   format_spec is the string after the ':'.  might be None
993
   conversion is either None, or the string after the '!'
994
*/
995
static PyObject *
formatteriter_next(PyObject *op)
{
    formatteriterobject *self = (formatteriterobject *)op;
    SubString literal;
    SubString field_name;
    SubString format_spec;
    Py_UCS4 conversion;
    int format_spec_needs_expanding;
    int field_present;
    int status;
    PyObject *literal_obj = NULL;
    PyObject *name_obj = NULL;
    PyObject *spec_obj = NULL;
    PyObject *conv_obj = NULL;
    PyObject *item = NULL;

    status = MarkupIterator_next(&self->it_markup, &literal, &field_present,
                                 &field_name, &format_spec, &conversion,
                                 &format_spec_needs_expanding);

    assert(0 <= status && status <= 2);
    if (status == 0 || status == 1) {
        /* if 0, error has already been set, if 1, iterator is empty */
        return NULL;
    }

    /* all of the SubString objects point into self->str, so no
       memory management needs to be done on them */
    literal_obj = SubString_new_object(&literal);
    if (literal_obj == NULL) {
        goto cleanup;
    }

    name_obj = SubString_new_object(&field_name);
    if (name_obj == NULL) {
        goto cleanup;
    }

    /* when a field is present, build format_spec with the "or_empty"
       constructor; otherwise use the plain constructor */
    if (field_present) {
        spec_obj = SubString_new_object_or_empty(&format_spec);
    }
    else {
        spec_obj = SubString_new_object(&format_spec);
    }
    if (spec_obj == NULL) {
        goto cleanup;
    }

    /* a specified conversion becomes a one-character string; an
       unspecified one becomes None */
    if (conversion != '\0') {
        conv_obj = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                             &conversion, 1);
    }
    else {
        conv_obj = Py_NewRef(Py_None);
    }
    if (conv_obj == NULL) {
        goto cleanup;
    }

    /* (literal, field_name, format_spec, conversion) */
    item = PyTuple_Pack(4, literal_obj, name_obj, spec_obj, conv_obj);

cleanup:
    /* PyTuple_Pack() took its own references; release ours on every
       path. */
    Py_XDECREF(literal_obj);
    Py_XDECREF(name_obj);
    Py_XDECREF(spec_obj);
    Py_XDECREF(conv_obj);
    return item;
}
1060
1061
static PyMethodDef formatteriter_methods[] = {
1062
    {NULL,              NULL}           /* sentinel */
1063
};
1064
1065
static PyTypeObject PyFormatterIter_Type = {
1066
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1067
    "formatteriterator",                /* tp_name */
1068
    sizeof(formatteriterobject),        /* tp_basicsize */
1069
    0,                                  /* tp_itemsize */
1070
    /* methods */
1071
    formatteriter_dealloc,              /* tp_dealloc */
1072
    0,                                  /* tp_vectorcall_offset */
1073
    0,                                  /* tp_getattr */
1074
    0,                                  /* tp_setattr */
1075
    0,                                  /* tp_as_async */
1076
    0,                                  /* tp_repr */
1077
    0,                                  /* tp_as_number */
1078
    0,                                  /* tp_as_sequence */
1079
    0,                                  /* tp_as_mapping */
1080
    0,                                  /* tp_hash */
1081
    0,                                  /* tp_call */
1082
    0,                                  /* tp_str */
1083
    PyObject_GenericGetAttr,            /* tp_getattro */
1084
    0,                                  /* tp_setattro */
1085
    0,                                  /* tp_as_buffer */
1086
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1087
    0,                                  /* tp_doc */
1088
    0,                                  /* tp_traverse */
1089
    0,                                  /* tp_clear */
1090
    0,                                  /* tp_richcompare */
1091
    0,                                  /* tp_weaklistoffset */
1092
    PyObject_SelfIter,                  /* tp_iter */
1093
    formatteriter_next,                 /* tp_iternext */
1094
    formatteriter_methods,              /* tp_methods */
1095
    0,
1096
};
1097
1098
/* formatter_parser is used to implement
1099
   string.Formatter.vformat.  it parses a string and returns tuples
1100
   describing the parsed elements.  It's a wrapper around
1101
   stringlib/unicode_format.h's MarkupIterator */
1102
static PyObject *
formatter_parser(PyObject *Py_UNUSED(module), PyObject *self)
{
    /* Returns a new formatteriterator over `self`, or NULL with an
       exception set if `self` is not a str or allocation fails. */
    formatteriterobject *iterobj;

    if (!PyUnicode_Check(self)) {
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
        return NULL;
    }

    iterobj = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
    if (iterobj != NULL) {
        /* the iterator holds its own reference to the string */
        iterobj->str = Py_NewRef(self);

        /* initialize the contained MarkupIterator over the whole string */
        MarkupIterator_init(&iterobj->it_markup, (PyObject*)self, 0,
                            PyUnicode_GET_LENGTH(self));
    }
    return (PyObject *)iterobj;
}
1123
1124
1125
/************************************************************************/
1126
/*********** fieldnameiterator ******************************************/
1127
/************************************************************************/
1128
1129
1130
/* This is used to implement string.Formatter.vparse().  It parses the
1131
   field name into attribute and item values.  It's a Python-callable
1132
   wrapper around FieldNameIterator */
1133
1134
typedef struct {
1135
    PyObject_HEAD
1136
    PyObject *str;
1137
    FieldNameIterator it_field;
1138
} fieldnameiterobject;
1139
1140
static void
1141
fieldnameiter_dealloc(PyObject *op)
1142
0
{
1143
0
    fieldnameiterobject *it = (fieldnameiterobject*)op;
1144
0
    Py_XDECREF(it->str);
1145
0
    PyObject_Free(it);
1146
0
}
1147
1148
/* returns a tuple:
1149
   (is_attr, value)
1150
   is_attr is true if we used attribute syntax (e.g., '.foo')
1151
              false if we used index syntax (e.g., '[foo]')
1152
   value is an integer or string
1153
*/
1154
static PyObject *
fieldnameiter_next(PyObject *op)
{
    fieldnameiterobject *self = (fieldnameiterobject *)op;
    SubString name;
    Py_ssize_t idx;
    int is_attr;
    int status;
    PyObject *is_attr_obj = NULL;
    PyObject *value_obj = NULL;
    PyObject *pair = NULL;

    status = FieldNameIterator_next(&self->it_field, &is_attr,
                                    &idx, &name);
    if (status == 0 || status == 1) {
        /* if 0, error has already been set, if 1, iterator is empty */
        return NULL;
    }

    is_attr_obj = PyBool_FromLong(is_attr);
    if (is_attr_obj == NULL) {
        goto cleanup;
    }

    /* idx == -1 means the value is a string taken from `name`;
       any other idx is returned as an integer */
    if (idx == -1) {
        value_obj = SubString_new_object(&name);
    }
    else {
        value_obj = PyLong_FromSsize_t(idx);
    }
    if (value_obj == NULL) {
        goto cleanup;
    }

    /* (is_attr, value) */
    pair = PyTuple_Pack(2, is_attr_obj, value_obj);

cleanup:
    /* PyTuple_Pack() took its own references; release ours on every
       path. */
    Py_XDECREF(is_attr_obj);
    Py_XDECREF(value_obj);
    return pair;
}
1194
1195
static PyMethodDef fieldnameiter_methods[] = {
1196
    {NULL,              NULL}           /* sentinel */
1197
};
1198
1199
static PyTypeObject PyFieldNameIter_Type = {
1200
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
1201
    "fieldnameiterator",                /* tp_name */
1202
    sizeof(fieldnameiterobject),        /* tp_basicsize */
1203
    0,                                  /* tp_itemsize */
1204
    /* methods */
1205
    fieldnameiter_dealloc,              /* tp_dealloc */
1206
    0,                                  /* tp_vectorcall_offset */
1207
    0,                                  /* tp_getattr */
1208
    0,                                  /* tp_setattr */
1209
    0,                                  /* tp_as_async */
1210
    0,                                  /* tp_repr */
1211
    0,                                  /* tp_as_number */
1212
    0,                                  /* tp_as_sequence */
1213
    0,                                  /* tp_as_mapping */
1214
    0,                                  /* tp_hash */
1215
    0,                                  /* tp_call */
1216
    0,                                  /* tp_str */
1217
    PyObject_GenericGetAttr,            /* tp_getattro */
1218
    0,                                  /* tp_setattro */
1219
    0,                                  /* tp_as_buffer */
1220
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1221
    0,                                  /* tp_doc */
1222
    0,                                  /* tp_traverse */
1223
    0,                                  /* tp_clear */
1224
    0,                                  /* tp_richcompare */
1225
    0,                                  /* tp_weaklistoffset */
1226
    PyObject_SelfIter,                  /* tp_iter */
1227
    fieldnameiter_next,                 /* tp_iternext */
1228
    fieldnameiter_methods,              /* tp_methods */
1229
    0};
1230
1231
/* formatter_field_name_split is used to implement
1232
   string.Formatter.vformat.  it takes a PEP 3101 "field name", and
1233
   returns a tuple of (first, rest): "first", the part before the
1234
   first '.' or '['; and "rest", an iterator for the rest of the field
1235
   name.  it's a wrapper around stringlib/unicode_format.h's
1236
   field_name_split.  The iterator it returns is a
1237
   FieldNameIterator */
1238
static PyObject *
formatter_field_name_split(PyObject *Py_UNUSED(module), PyObject *self)
{
    /* Returns a new 2-tuple (first, rest_iterator), or NULL with an
       exception set if `self` is not a str or any step fails. */
    fieldnameiterobject *rest;
    SubString first;
    Py_ssize_t first_idx;
    PyObject *first_obj = NULL;
    PyObject *pair = NULL;

    if (!PyUnicode_Check(self)) {
        PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
        return NULL;
    }

    rest = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
    if (rest == NULL) {
        return NULL;
    }

    /* the iterator holds its own reference to the string.  this is
       just to keep the field_name alive */
    rest->str = Py_NewRef(self);

    /* Pass in auto_number = NULL. We'll return an empty string for
       first_obj in that case. */
    if (field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self),
                         &first, &first_idx, &rest->it_field, NULL)) {
        /* first becomes an integer, if possible; else a string */
        if (first_idx == -1) {
            first_obj = SubString_new_object(&first);
        }
        else {
            first_obj = PyLong_FromSsize_t(first_idx);
        }

        if (first_obj != NULL) {
            /* (first, rest) */
            pair = PyTuple_Pack(2, first_obj, rest);
        }
    }

    /* On success the tuple holds its own references; on failure these
       are the last ones. */
    Py_DECREF(rest);
    Py_XDECREF(first_obj);
    return pair;
}