Coverage Report

Created: 2025-07-11 06:24

/src/cpython/Objects/unicodeobject.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_freelist.h"      // _Py_FREELIST_FREE(), _Py_FREELIST_POP()
50
#include "pycore_initconfig.h"    // _PyStatus_OK()
51
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
52
#include "pycore_long.h"          // _PyLong_FormatWriter()
53
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
54
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
55
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
56
#include "pycore_pyhash.h"        // _Py_HashSecret_t
57
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
58
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
59
#include "pycore_template.h"      // _PyTemplate_Concat()
60
#include "pycore_tuple.h"         // _PyTuple_FromArray()
61
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
62
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
63
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
64
65
#include "stringlib/eq.h"         // unicode_eq()
66
#include <stddef.h>               // ptrdiff_t
67
68
#ifdef MS_WINDOWS
69
#include <windows.h>
70
#endif
71
72
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
73
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
74
#endif
75
76
/* Uncomment to display statistics on interned strings at exit
77
   in _PyUnicode_ClearInterned(). */
78
/* #define INTERNED_STATS 1 */
79
80
81
/*[clinic input]
82
class str "PyObject *" "&PyUnicode_Type"
83
[clinic start generated code]*/
84
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
85
86
/*[python input]
87
class Py_UCS4_converter(CConverter):
88
    type = 'Py_UCS4'
89
    converter = 'convert_uc'
90
91
    def converter_init(self):
92
        if self.default is not unspecified:
93
            self.c_default = ascii(self.default)
94
            if len(self.c_default) > 4 or self.c_default[0] != "'":
95
                self.c_default = hex(ord(self.default))
96
97
[python start generated code]*/
98
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
99
100
/* --- Globals ------------------------------------------------------------
101
102
NOTE: In the interpreter's initialization phase, some globals are currently
103
      initialized dynamically as needed. In the process Unicode objects may
104
      be created before the Unicode type is ready.
105
106
*/
107
108
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
109
// The value must be the same in fileutils.c.
110
74.9M
#define MAX_UNICODE 0x10ffff
111
112
#ifdef Py_DEBUG
113
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
114
#else
115
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
116
#endif
117
118
#ifdef Py_GIL_DISABLED
119
#  define LOCK_INTERNED(interp) PyMutex_Lock(&_Py_INTERP_CACHED_OBJECT(interp, interned_mutex))
120
#  define UNLOCK_INTERNED(interp) PyMutex_Unlock(&_Py_INTERP_CACHED_OBJECT(interp, interned_mutex))
121
#else
122
#  define LOCK_INTERNED(interp)
123
#  define UNLOCK_INTERNED(interp)
124
#endif
125
126
/* Read the `utf8` pointer stored in the compact unicode header of `op`,
   using FT_ATOMIC_LOAD_PTR_ACQUIRE for the load. */
static inline char* _PyUnicode_UTF8(PyObject *op)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(compact->utf8);
}
130
131
/* Return the UTF-8 buffer of `op`.
   For a compact ASCII string this is the memory located immediately after
   the PyASCIIObject header; otherwise it is the stored `utf8` pointer. */
static inline char* PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyUnicode_UTF8(op);
    }
    return (char *)(_PyASCIIObject_CAST(op) + 1);
}
141
142
/* Store `utf8` into the `utf8` field of the compact unicode header of `op`,
   using FT_ATOMIC_STORE_PTR_RELEASE for the store. */
static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    FT_ATOMIC_STORE_PTR_RELEASE(compact->utf8, utf8);
}
146
147
/* Return the length associated with the UTF-8 buffer of `op`:
   the `length` field for a compact ASCII string, the `utf8_length` field
   of the compact header otherwise. */
static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyCompactUnicodeObject_CAST(op)->utf8_length;
    }
    return _PyASCIIObject_CAST(op)->length;
}
157
158
/* Set the `utf8_length` field of the compact unicode header of `op`. */
static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    compact->utf8_length = length;
}
162
163
/* Field access expressions on the PyASCIIObject header of `op`. */
#define _PyUnicode_LENGTH(op) (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op) (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op) (_PyASCIIObject_CAST(op)->hash)
169
170
92.9M
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
171
172
/* Store `hash` into the `hash` field of the PyASCIIObject header of `op`,
   using FT_ATOMIC_STORE_SSIZE_RELAXED for the store. */
static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    FT_ATOMIC_STORE_SSIZE_RELAXED(ascii->hash, hash);
}
176
177
/* Access the `data.any` pointer of the PyUnicodeObject `op`. */
#define _PyUnicode_DATA_ANY(op) (_PyUnicodeObject_CAST(op)->data.any)
179
180
/* Return non-zero if the UTF-8 pointer of `op` is the same address as its
   character data. Must not be called on a compact ASCII string. */
static inline int _PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));

    const void *utf8 = _PyUnicode_UTF8(op);
    const void *data = PyUnicode_DATA(op);
    return (utf8 == data);
}
186
187
/* true if the Unicode object has an allocated UTF-8 memory block
188
   (not shared with other data) */
189
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
190
481M
{
191
481M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
192
481M
            && _PyUnicode_UTF8(op) != NULL
193
481M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
194
481M
}
195
196
197
/* Generic helper macro converting a run of characters from one character
   type to another.

   from_type and to_type must be valid type names.  begin and end point to
   the source characters and should be of type "from_type *".  to points to
   the destination buffer and should be of type "to_type *".

   The main loop converts four characters per iteration; a second loop
   handles the remaining (at most three) characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)  \
    do {                                                              \
        to_type *_to = (to_type *)(to);                               \
        const from_type *_iter = (const from_type *)(begin);          \
        const from_type *_end = (const from_type *)(end);             \
        const Py_ssize_t _len = (_end) - (_iter);                     \
        const from_type *_unrolled_end =                              \
            _iter + _Py_SIZE_ROUND_DOWN(_len, 4);                     \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {       \
            _to[0] = (to_type) _iter[0];                              \
            _to[1] = (to_type) _iter[1];                              \
            _to[2] = (to_type) _iter[2];                              \
            _to[3] = (to_type) _iter[3];                              \
        }                                                             \
        for (; _iter < (_end); ++_iter, ++_to) {                      \
            *_to = (to_type) *_iter;                                  \
        }                                                             \
    } while (0)
220
221
221M
#define LATIN1 _Py_LATIN1_CHR
222
223
#ifdef MS_WINDOWS
224
   /* On Windows, overallocating by 50% is the best factor */
225
#  define OVERALLOCATE_FACTOR 2
226
#else
227
   /* On Linux, overallocating by 25% is the best factor */
228
92.2M
#  define OVERALLOCATE_FACTOR 4
229
#endif
230
231
/* Forward declaration */
232
static inline int
233
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
234
static inline void
235
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
236
static PyObject *
237
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
238
                    const char *errors);
239
static PyObject *
240
unicode_decode_utf8(const char *s, Py_ssize_t size,
241
                    _Py_error_handler error_handler, const char *errors,
242
                    Py_ssize_t *consumed);
243
static int
244
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
245
                           const char *s, Py_ssize_t size,
246
                           _Py_error_handler error_handler, const char *errors,
247
                           Py_ssize_t *consumed);
248
#ifdef Py_DEBUG
249
static inline int unicode_is_finalizing(void);
250
static int unicode_is_singleton(PyObject *unicode);
251
#endif
252
253
254
// Return a reference to the immortal empty string singleton.
255
static inline PyObject* unicode_get_empty(void)
256
103M
{
257
103M
    _Py_DECLARE_STR(empty, "");
258
103M
    return &_Py_STR(empty);
259
103M
}
260
261
/* This dictionary holds per-interpreter interned strings.
262
 * See InternalDocs/string_interning.md for details.
263
 */
264
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
265
3.06M
{
266
3.06M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
267
3.06M
}
268
269
/* This hashtable holds statically allocated interned strings.
270
 * See InternalDocs/string_interning.md for details.
271
 */
272
2.88M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
273
274
/* Get number of all interned strings for the current interpreter. */
275
Py_ssize_t
276
_PyUnicode_InternedSize(void)
277
0
{
278
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
279
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
280
0
}
281
282
/* Get number of immortal interned strings for the current interpreter. */
283
Py_ssize_t
284
_PyUnicode_InternedSize_Immortal(void)
285
0
{
286
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
287
0
    PyObject *key, *value;
288
0
    Py_ssize_t pos = 0;
289
0
    Py_ssize_t count = 0;
290
291
    // It's tempting to keep a count and avoid a loop here. But, this function
292
    // is intended for refleak tests. It spends extra work to report the true
293
    // value, to help detect bugs in optimizations.
294
295
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
296
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
297
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
298
0
           count++;
299
0
       }
300
0
    }
301
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
302
0
}
303
304
static Py_hash_t unicode_hash(PyObject *);
305
306
static Py_uhash_t
307
hashtable_unicode_hash(const void *key)
308
2.88M
{
309
2.88M
    return unicode_hash((PyObject *)key);
310
2.88M
}
311
312
static int
313
hashtable_unicode_compare(const void *key1, const void *key2)
314
249k
{
315
249k
    PyObject *obj1 = (PyObject *)key1;
316
249k
    PyObject *obj2 = (PyObject *)key2;
317
249k
    if (obj1 != NULL && obj2 != NULL) {
318
249k
        return unicode_eq(obj1, obj2);
319
249k
    }
320
0
    else {
321
0
        return obj1 == obj2;
322
0
    }
323
249k
}
324
325
/* Return true if this interpreter should share the main interpreter's
326
   intern_dict.  That's important for interpreters which load basic
327
   single-phase init extension modules (m_size == -1).  There could be interned
328
   immortal strings that are shared between interpreters, due to the
329
   PyDict_Update(mdict, m_copy) call in import_find_extension().
330
331
   It's not safe to deallocate those strings until all interpreters that
332
   potentially use them are freed.  By storing them in the main interpreter, we
333
   ensure they get freed after all other interpreters are freed.
334
*/
335
static bool
336
has_shared_intern_dict(PyInterpreterState *interp)
337
16
{
338
16
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
339
16
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
340
16
}
341
342
static int
343
init_interned_dict(PyInterpreterState *interp)
344
16
{
345
16
    assert(get_interned_dict(interp) == NULL);
346
16
    PyObject *interned;
347
16
    if (has_shared_intern_dict(interp)) {
348
0
        interned = get_interned_dict(_PyInterpreterState_Main());
349
0
        Py_INCREF(interned);
350
0
    }
351
16
    else {
352
16
        interned = PyDict_New();
353
16
        if (interned == NULL) {
354
0
            return -1;
355
0
        }
356
16
    }
357
16
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
358
16
    return 0;
359
16
}
360
361
static void
362
clear_interned_dict(PyInterpreterState *interp)
363
0
{
364
0
    PyObject *interned = get_interned_dict(interp);
365
0
    if (interned != NULL) {
366
0
        if (!has_shared_intern_dict(interp)) {
367
            // only clear if the dict belongs to this interpreter
368
0
            PyDict_Clear(interned);
369
0
        }
370
0
        Py_DECREF(interned);
371
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
372
0
    }
373
0
}
374
375
static PyStatus
376
init_global_interned_strings(PyInterpreterState *interp)
377
16
{
378
16
    assert(INTERNED_STRINGS == NULL);
379
16
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
380
381
16
    INTERNED_STRINGS = _Py_hashtable_new_full(
382
16
        hashtable_unicode_hash,
383
16
        hashtable_unicode_compare,
384
        // Objects stored here are immortal and statically allocated,
385
        // so we don't need key_destroy_func & value_destroy_func:
386
16
        NULL,
387
16
        NULL,
388
16
        &hashtable_alloc
389
16
    );
390
16
    if (INTERNED_STRINGS == NULL) {
391
0
        PyErr_Clear();
392
0
        return _PyStatus_ERR("failed to create global interned dict");
393
0
    }
394
395
    /* Intern statically allocated string identifiers, deepfreeze strings,
396
        * and one-byte latin-1 strings.
397
        * This must be done before any module initialization so that statically
398
        * allocated string identifiers are used instead of heap allocated strings.
399
        * Deepfreeze uses the interned identifiers if present to save space
400
        * else generates them and they are interned to speed up dict lookups.
401
    */
402
16
    _PyUnicode_InitStaticStrings(interp);
403
404
4.11k
    for (int i = 0; i < 256; i++) {
405
4.09k
        PyObject *s = LATIN1(i);
406
4.09k
        _PyUnicode_InternStatic(interp, &s);
407
4.09k
        assert(s == LATIN1(i));
408
4.09k
    }
409
#ifdef Py_DEBUG
410
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
411
412
    for (int i = 0; i < 256; i++) {
413
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
414
    }
415
#endif
416
16
    return _PyStatus_OK();
417
16
}
418
419
static void clear_global_interned_strings(void)
420
0
{
421
0
    if (INTERNED_STRINGS != NULL) {
422
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
423
0
        INTERNED_STRINGS = NULL;
424
0
    }
425
0
}
426
427
/* Return the empty string singleton from the enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { return unicode_get_empty(); } while (0)
431
432
static inline void
433
unicode_fill(int kind, void *data, Py_UCS4 value,
434
             Py_ssize_t start, Py_ssize_t length)
435
11.7M
{
436
11.7M
    assert(0 <= start);
437
11.7M
    switch (kind) {
438
3.37M
    case PyUnicode_1BYTE_KIND: {
439
3.37M
        assert(value <= 0xff);
440
3.37M
        Py_UCS1 ch = (unsigned char)value;
441
3.37M
        Py_UCS1 *to = (Py_UCS1 *)data + start;
442
3.37M
        memset(to, ch, length);
443
3.37M
        break;
444
0
    }
445
5.57M
    case PyUnicode_2BYTE_KIND: {
446
5.57M
        assert(value <= 0xffff);
447
5.57M
        Py_UCS2 ch = (Py_UCS2)value;
448
5.57M
        Py_UCS2 *to = (Py_UCS2 *)data + start;
449
5.57M
        const Py_UCS2 *end = to + length;
450
49.6M
        for (; to < end; ++to) *to = ch;
451
5.57M
        break;
452
0
    }
453
2.82M
    case PyUnicode_4BYTE_KIND: {
454
2.82M
        assert(value <= MAX_UNICODE);
455
2.82M
        Py_UCS4 ch = value;
456
2.82M
        Py_UCS4 * to = (Py_UCS4 *)data + start;
457
2.82M
        const Py_UCS4 *end = to + length;
458
24.5M
        for (; to < end; ++to) *to = ch;
459
2.82M
        break;
460
0
    }
461
0
    default: Py_UNREACHABLE();
462
11.7M
    }
463
11.7M
}
464
465
466
/* Fast detection of the most frequent whitespace characters.
   The table has 128 entries, indexed by ASCII code; an entry is 1 when the
   character is whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    [0x7F] = 0      /* last index: keeps the table at 128 entries */
};
496
497
/* forward */
498
static PyObject* get_latin1_char(unsigned char ch);
499
static int unicode_modifiable(PyObject *unicode);
500
501
502
static PyObject *
503
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
504
static PyObject *
505
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
506
static PyObject *
507
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
508
509
static PyObject *
510
unicode_encode_call_errorhandler(const char *errors,
511
       PyObject **errorHandler,const char *encoding, const char *reason,
512
       PyObject *unicode, PyObject **exceptionObject,
513
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
514
515
static void
516
raise_encode_exception(PyObject **exceptionObject,
517
                       const char *encoding,
518
                       PyObject *unicode,
519
                       Py_ssize_t startpos, Py_ssize_t endpos,
520
                       const char *reason);
521
522
/* Fast detection of line break characters.
   The table has 128 entries, indexed by ASCII code; an entry is 1 when the
   character is a line break and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x7F] = 0      /* last index: keeps the table at 128 entries */
};
549
550
static int convert_uc(PyObject *obj, void *addr);
551
552
struct encoding_map;
553
#include "clinic/unicodeobject.c.h"
554
555
_Py_error_handler
556
_Py_GetErrorHandler(const char *errors)
557
499k
{
558
499k
    if (errors == NULL || strcmp(errors, "strict") == 0) {
559
185k
        return _Py_ERROR_STRICT;
560
185k
    }
561
313k
    if (strcmp(errors, "surrogateescape") == 0) {
562
163k
        return _Py_ERROR_SURROGATEESCAPE;
563
163k
    }
564
149k
    if (strcmp(errors, "replace") == 0) {
565
149k
        return _Py_ERROR_REPLACE;
566
149k
    }
567
0
    if (strcmp(errors, "ignore") == 0) {
568
0
        return _Py_ERROR_IGNORE;
569
0
    }
570
0
    if (strcmp(errors, "backslashreplace") == 0) {
571
0
        return _Py_ERROR_BACKSLASHREPLACE;
572
0
    }
573
0
    if (strcmp(errors, "surrogatepass") == 0) {
574
0
        return _Py_ERROR_SURROGATEPASS;
575
0
    }
576
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
577
0
        return _Py_ERROR_XMLCHARREFREPLACE;
578
0
    }
579
0
    return _Py_ERROR_OTHER;
580
0
}
581
582
583
static _Py_error_handler
584
get_error_handler_wide(const wchar_t *errors)
585
5.55k
{
586
5.55k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
587
0
        return _Py_ERROR_STRICT;
588
0
    }
589
5.55k
    if (wcscmp(errors, L"surrogateescape") == 0) {
590
5.55k
        return _Py_ERROR_SURROGATEESCAPE;
591
5.55k
    }
592
0
    if (wcscmp(errors, L"replace") == 0) {
593
0
        return _Py_ERROR_REPLACE;
594
0
    }
595
0
    if (wcscmp(errors, L"ignore") == 0) {
596
0
        return _Py_ERROR_IGNORE;
597
0
    }
598
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
599
0
        return _Py_ERROR_BACKSLASHREPLACE;
600
0
    }
601
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
602
0
        return _Py_ERROR_SURROGATEPASS;
603
0
    }
604
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
605
0
        return _Py_ERROR_XMLCHARREFREPLACE;
606
0
    }
607
0
    return _Py_ERROR_OTHER;
608
0
}
609
610
611
static inline int
612
unicode_check_encoding_errors(const char *encoding, const char *errors)
613
18.0M
{
614
18.0M
    if (encoding == NULL && errors == NULL) {
615
10.3M
        return 0;
616
10.3M
    }
617
618
7.66M
    PyInterpreterState *interp = _PyInterpreterState_GET();
619
7.66M
#ifndef Py_DEBUG
620
    /* In release mode, only check in development mode (-X dev) */
621
7.66M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
622
7.66M
        return 0;
623
7.66M
    }
624
#else
625
    /* Always check in debug mode */
626
#endif
627
628
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
629
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
630
0
    if (!interp->unicode.fs_codec.encoding) {
631
0
        return 0;
632
0
    }
633
634
    /* Disable checks during Python finalization. For example, it allows to
635
       call _PyObject_Dump() during finalization for debugging purpose. */
636
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
637
0
        return 0;
638
0
    }
639
640
0
    if (encoding != NULL
641
        // Fast path for the most common built-in encodings. Even if the codec
642
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
643
        // create a temporary Unicode string (the key in the cache).
644
0
        && strcmp(encoding, "utf-8") != 0
645
0
        && strcmp(encoding, "utf8") != 0
646
0
        && strcmp(encoding, "ascii") != 0)
647
0
    {
648
0
        PyObject *handler = _PyCodec_Lookup(encoding);
649
0
        if (handler == NULL) {
650
0
            return -1;
651
0
        }
652
0
        Py_DECREF(handler);
653
0
    }
654
655
0
    if (errors != NULL
656
        // Fast path for the most common built-in error handlers.
657
0
        && strcmp(errors, "strict") != 0
658
0
        && strcmp(errors, "ignore") != 0
659
0
        && strcmp(errors, "replace") != 0
660
0
        && strcmp(errors, "surrogateescape") != 0
661
0
        && strcmp(errors, "surrogatepass") != 0)
662
0
    {
663
0
        PyObject *handler = PyCodec_LookupError(errors);
664
0
        if (handler == NULL) {
665
0
            return -1;
666
0
        }
667
0
        Py_DECREF(handler);
668
0
    }
669
0
    return 0;
670
0
}
671
672
673
int
674
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
675
0
{
676
0
#define CHECK(expr) \
677
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
678
679
0
    assert(op != NULL);
680
0
    CHECK(PyUnicode_Check(op));
681
682
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
683
0
    int kind = ascii->state.kind;
684
685
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
686
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
687
0
    }
688
0
    else {
689
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
690
0
        void *data;
691
692
0
        if (ascii->state.compact == 1) {
693
0
            data = compact + 1;
694
0
            CHECK(kind == PyUnicode_1BYTE_KIND
695
0
                                 || kind == PyUnicode_2BYTE_KIND
696
0
                                 || kind == PyUnicode_4BYTE_KIND);
697
0
            CHECK(ascii->state.ascii == 0);
698
0
            CHECK(_PyUnicode_UTF8(op) != data);
699
0
        }
700
0
        else {
701
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
702
703
0
            data = unicode->data.any;
704
0
            CHECK(kind == PyUnicode_1BYTE_KIND
705
0
                     || kind == PyUnicode_2BYTE_KIND
706
0
                     || kind == PyUnicode_4BYTE_KIND);
707
0
            CHECK(ascii->state.compact == 0);
708
0
            CHECK(data != NULL);
709
0
            if (ascii->state.ascii) {
710
0
                CHECK(_PyUnicode_UTF8(op) == data);
711
0
                CHECK(compact->utf8_length == ascii->length);
712
0
            }
713
0
            else {
714
0
                CHECK(_PyUnicode_UTF8(op) != data);
715
0
            }
716
0
        }
717
0
#ifndef Py_GIL_DISABLED
718
0
        if (_PyUnicode_UTF8(op) == NULL)
719
0
            CHECK(compact->utf8_length == 0);
720
0
#endif
721
0
    }
722
723
    /* check that the best kind is used: O(n) operation */
724
0
    if (check_content) {
725
0
        Py_ssize_t i;
726
0
        Py_UCS4 maxchar = 0;
727
0
        const void *data;
728
0
        Py_UCS4 ch;
729
730
0
        data = PyUnicode_DATA(ascii);
731
0
        for (i=0; i < ascii->length; i++)
732
0
        {
733
0
            ch = PyUnicode_READ(kind, data, i);
734
0
            if (ch > maxchar)
735
0
                maxchar = ch;
736
0
        }
737
0
        if (kind == PyUnicode_1BYTE_KIND) {
738
0
            if (ascii->state.ascii == 0) {
739
0
                CHECK(maxchar >= 128);
740
0
                CHECK(maxchar <= 255);
741
0
            }
742
0
            else
743
0
                CHECK(maxchar < 128);
744
0
        }
745
0
        else if (kind == PyUnicode_2BYTE_KIND) {
746
0
            CHECK(maxchar >= 0x100);
747
0
            CHECK(maxchar <= 0xFFFF);
748
0
        }
749
0
        else {
750
0
            CHECK(maxchar >= 0x10000);
751
0
            CHECK(maxchar <= MAX_UNICODE);
752
0
        }
753
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
754
0
    }
755
756
    /* Check interning state */
757
#ifdef Py_DEBUG
758
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
759
    // extensions can make immortal strings mortal (but with a high enough
760
    // refcount).
761
    // The other way is extremely unlikely (worth a potential failed assertion
762
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
763
    switch (PyUnicode_CHECK_INTERNED(op)) {
764
        case SSTATE_NOT_INTERNED:
765
            if (ascii->state.statically_allocated) {
766
                // This state is for two exceptions:
767
                // - strings are currently checked before they're interned
768
                // - the 256 one-latin1-character strings
769
                //   are static but use SSTATE_NOT_INTERNED
770
            }
771
            else {
772
                CHECK(!_Py_IsImmortal(op));
773
            }
774
            break;
775
        case SSTATE_INTERNED_MORTAL:
776
            CHECK(!ascii->state.statically_allocated);
777
            CHECK(!_Py_IsImmortal(op));
778
            break;
779
        case SSTATE_INTERNED_IMMORTAL:
780
            CHECK(!ascii->state.statically_allocated);
781
            break;
782
        case SSTATE_INTERNED_IMMORTAL_STATIC:
783
            CHECK(ascii->state.statically_allocated);
784
            break;
785
        default:
786
            Py_UNREACHABLE();
787
    }
788
#endif
789
790
0
    return 1;
791
792
0
#undef CHECK
793
0
}
794
795
/* Normalize a string result to the shared singletons.
   An empty string is replaced by the empty string singleton, and a
   one-character string of 1-byte kind by the matching LATIN1() singleton;
   in both cases `unicode` is released with Py_DECREF() unless it already is
   that singleton.  Any other string is returned as is. */
static PyObject*
unicode_result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    PyObject *singleton = NULL;
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        singleton = unicode_get_empty();
    }
    else if (length == 1 && PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
        /* Read into a local first: the character is then passed to the
           LATIN1() macro as a plain variable. */
        Py_UCS1 ch = PyUnicode_1BYTE_DATA(unicode)[0];
        singleton = LATIN1(ch);
    }

    if (singleton != NULL) {
        if (unicode != singleton) {
            Py_DECREF(unicode);
        }
        return singleton;
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}
825
826
/* Return `unicode` as an exact str object.
   An exact str is returned as a new reference to the same object; an
   instance of a subtype is copied with _PyUnicode_Copy() so that the result
   is a genuine unicode string with the same value. */
static PyObject*
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        /* Subtype -- return genuine unicode string with the same value. */
        return _PyUnicode_Copy(unicode);
    }
    return Py_NewRef(unicode);
}
836
837
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
838
   ASCII, Latin1, UTF-8, etc. */
839
static char*
840
backslashreplace(_PyBytesWriter *writer, char *str,
841
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
842
0
{
843
0
    Py_ssize_t size, i;
844
0
    Py_UCS4 ch;
845
0
    int kind;
846
0
    const void *data;
847
848
0
    kind = PyUnicode_KIND(unicode);
849
0
    data = PyUnicode_DATA(unicode);
850
851
0
    size = 0;
852
    /* determine replacement size */
853
0
    for (i = collstart; i < collend; ++i) {
854
0
        Py_ssize_t incr;
855
856
0
        ch = PyUnicode_READ(kind, data, i);
857
0
        if (ch < 0x100)
858
0
            incr = 2+2;
859
0
        else if (ch < 0x10000)
860
0
            incr = 2+4;
861
0
        else {
862
0
            assert(ch <= MAX_UNICODE);
863
0
            incr = 2+8;
864
0
        }
865
0
        if (size > PY_SSIZE_T_MAX - incr) {
866
0
            PyErr_SetString(PyExc_OverflowError,
867
0
                            "encoded result is too long for a Python string");
868
0
            return NULL;
869
0
        }
870
0
        size += incr;
871
0
    }
872
873
0
    str = _PyBytesWriter_Prepare(writer, str, size);
874
0
    if (str == NULL)
875
0
        return NULL;
876
877
    /* generate replacement */
878
0
    for (i = collstart; i < collend; ++i) {
879
0
        ch = PyUnicode_READ(kind, data, i);
880
0
        *str++ = '\\';
881
0
        if (ch >= 0x00010000) {
882
0
            *str++ = 'U';
883
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
884
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
885
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
886
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
887
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
888
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
889
0
        }
890
0
        else if (ch >= 0x100) {
891
0
            *str++ = 'u';
892
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
893
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
894
0
        }
895
0
        else
896
0
            *str++ = 'x';
897
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
898
0
        *str++ = Py_hexdigits[ch&0xf];
899
0
    }
900
0
    return str;
901
0
}
902
903
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
904
   ASCII, Latin1, UTF-8, etc. */
905
static char*
906
xmlcharrefreplace(_PyBytesWriter *writer, char *str,
907
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
908
0
{
909
0
    Py_ssize_t size, i;
910
0
    Py_UCS4 ch;
911
0
    int kind;
912
0
    const void *data;
913
914
0
    kind = PyUnicode_KIND(unicode);
915
0
    data = PyUnicode_DATA(unicode);
916
917
0
    size = 0;
918
    /* determine replacement size */
919
0
    for (i = collstart; i < collend; ++i) {
920
0
        Py_ssize_t incr;
921
922
0
        ch = PyUnicode_READ(kind, data, i);
923
0
        if (ch < 10)
924
0
            incr = 2+1+1;
925
0
        else if (ch < 100)
926
0
            incr = 2+2+1;
927
0
        else if (ch < 1000)
928
0
            incr = 2+3+1;
929
0
        else if (ch < 10000)
930
0
            incr = 2+4+1;
931
0
        else if (ch < 100000)
932
0
            incr = 2+5+1;
933
0
        else if (ch < 1000000)
934
0
            incr = 2+6+1;
935
0
        else {
936
0
            assert(ch <= MAX_UNICODE);
937
0
            incr = 2+7+1;
938
0
        }
939
0
        if (size > PY_SSIZE_T_MAX - incr) {
940
0
            PyErr_SetString(PyExc_OverflowError,
941
0
                            "encoded result is too long for a Python string");
942
0
            return NULL;
943
0
        }
944
0
        size += incr;
945
0
    }
946
947
0
    str = _PyBytesWriter_Prepare(writer, str, size);
948
0
    if (str == NULL)
949
0
        return NULL;
950
951
    /* generate replacement */
952
0
    for (i = collstart; i < collend; ++i) {
953
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
954
0
        if (size < 0) {
955
0
            return NULL;
956
0
        }
957
0
        str += size;
958
0
    }
959
0
    return str;
960
0
}
961
962
/* --- Bloom Filters ----------------------------------------------------- */
963
964
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the
   bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */

/* BLOOM_WIDTH is the number of bits used in a mask: the largest of
   128, 64 or 32 that fits in LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* Starts with every bit set, so BLOOM() reports a possible match for any
   character until the mask is replaced. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero when the bit selected by `ch` is set in `mask`.  A zero result
   means `ch` is definitely not in the set the mask was built from; a
   non-zero result only means it might be.  Both arguments are
   parenthesized so that any expression can be passed safely. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup below 128, otherwise the bloom mask is
   consulted first and Py_UNICODE_ISLINEBREAK() only on a possible match. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
989
990
static inline BLOOM_MASK
991
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
992
7.73M
{
993
7.73M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
994
7.73M
    do {                                               \
995
7.73M
        TYPE *data = (TYPE *)PTR;                      \
996
7.73M
        TYPE *end = data + LEN;                        \
997
7.73M
        Py_UCS4 ch;                                    \
998
16.9M
        for (; data != end; data++) {                  \
999
9.18M
            ch = *data;                                \
1000
9.18M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
1001
9.18M
        }                                              \
1002
7.73M
        break;                                         \
1003
7.73M
    } while (0)
1004
1005
    /* calculate simple bloom-style bitmask for a given unicode string */
1006
1007
7.73M
    BLOOM_MASK mask;
1008
1009
7.73M
    mask = 0;
1010
7.73M
    switch (kind) {
1011
7.73M
    case PyUnicode_1BYTE_KIND:
1012
7.73M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
1013
0
        break;
1014
16
    case PyUnicode_2BYTE_KIND:
1015
16
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
1016
0
        break;
1017
0
    case PyUnicode_4BYTE_KIND:
1018
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
1019
0
        break;
1020
0
    default:
1021
0
        Py_UNREACHABLE();
1022
7.73M
    }
1023
7.73M
    return mask;
1024
1025
7.73M
#undef BLOOM_UPDATE
1026
7.73M
}
1027
1028
static int
1029
ensure_unicode(PyObject *obj)
1030
160M
{
1031
160M
    if (!PyUnicode_Check(obj)) {
1032
0
        PyErr_Format(PyExc_TypeError,
1033
0
                     "must be str, not %.100s",
1034
0
                     Py_TYPE(obj)->tp_name);
1035
0
        return -1;
1036
0
    }
1037
160M
    return 0;
1038
160M
}
1039
1040
/* Compilation of templated routines */
1041
1042
947k
#define STRINGLIB_GET_EMPTY() unicode_get_empty()
1043
1044
#include "stringlib/asciilib.h"
1045
#include "stringlib/fastsearch.h"
1046
#include "stringlib/partition.h"
1047
#include "stringlib/split.h"
1048
#include "stringlib/count.h"
1049
#include "stringlib/find.h"
1050
#include "stringlib/find_max_char.h"
1051
#include "stringlib/undef.h"
1052
1053
#include "stringlib/ucs1lib.h"
1054
#include "stringlib/fastsearch.h"
1055
#include "stringlib/partition.h"
1056
#include "stringlib/split.h"
1057
#include "stringlib/count.h"
1058
#include "stringlib/find.h"
1059
#include "stringlib/replace.h"
1060
#include "stringlib/repr.h"
1061
#include "stringlib/find_max_char.h"
1062
#include "stringlib/undef.h"
1063
1064
#include "stringlib/ucs2lib.h"
1065
#include "stringlib/fastsearch.h"
1066
#include "stringlib/partition.h"
1067
#include "stringlib/split.h"
1068
#include "stringlib/count.h"
1069
#include "stringlib/find.h"
1070
#include "stringlib/replace.h"
1071
#include "stringlib/repr.h"
1072
#include "stringlib/find_max_char.h"
1073
#include "stringlib/undef.h"
1074
1075
#include "stringlib/ucs4lib.h"
1076
#include "stringlib/fastsearch.h"
1077
#include "stringlib/partition.h"
1078
#include "stringlib/split.h"
1079
#include "stringlib/count.h"
1080
#include "stringlib/find.h"
1081
#include "stringlib/replace.h"
1082
#include "stringlib/repr.h"
1083
#include "stringlib/find_max_char.h"
1084
#include "stringlib/undef.h"
1085
1086
#undef STRINGLIB_GET_EMPTY
1087
1088
/* --- Unicode Object ----------------------------------------------------- */
1089
1090
static inline Py_ssize_t
1091
findchar(const void *s, int kind,
1092
         Py_ssize_t size, Py_UCS4 ch,
1093
         int direction)
1094
102M
{
1095
102M
    switch (kind) {
1096
95.6M
    case PyUnicode_1BYTE_KIND:
1097
95.6M
        if ((Py_UCS1) ch != ch)
1098
3.42k
            return -1;
1099
95.6M
        if (direction > 0)
1100
95.6M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1101
8.03k
        else
1102
8.03k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1103
5.99M
    case PyUnicode_2BYTE_KIND:
1104
5.99M
        if ((Py_UCS2) ch != ch)
1105
0
            return -1;
1106
5.99M
        if (direction > 0)
1107
5.95M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1108
35.1k
        else
1109
35.1k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1110
777k
    case PyUnicode_4BYTE_KIND:
1111
777k
        if (direction > 0)
1112
625k
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1113
151k
        else
1114
151k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1115
0
    default:
1116
0
        Py_UNREACHABLE();
1117
102M
    }
1118
102M
}
1119
1120
#ifdef Py_DEBUG
/* Debug aid: overwrite the characters added by a resize, i.e. those from
   index old_length up to the current length, with 0xff bytes so that use
   of uninitialized data is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Does nothing when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* offsets and counts are in bytes, hence the scaling by char_size */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1138
1139
static PyObject*
1140
resize_copy(PyObject *unicode, Py_ssize_t length)
1141
0
{
1142
0
    Py_ssize_t copy_length;
1143
0
    PyObject *copy;
1144
1145
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1146
0
    if (copy == NULL)
1147
0
        return NULL;
1148
1149
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1150
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1151
0
    return copy;
1152
0
}
1153
1154
static PyObject*
1155
resize_compact(PyObject *unicode, Py_ssize_t length)
1156
54.0M
{
1157
54.0M
    Py_ssize_t char_size;
1158
54.0M
    Py_ssize_t struct_size;
1159
54.0M
    Py_ssize_t new_size;
1160
54.0M
    PyObject *new_unicode;
1161
#ifdef Py_DEBUG
1162
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1163
#endif
1164
1165
54.0M
    if (!unicode_modifiable(unicode)) {
1166
0
        PyObject *copy = resize_copy(unicode, length);
1167
0
        if (copy == NULL) {
1168
0
            return NULL;
1169
0
        }
1170
0
        Py_DECREF(unicode);
1171
0
        return copy;
1172
0
    }
1173
54.0M
    assert(PyUnicode_IS_COMPACT(unicode));
1174
1175
54.0M
    char_size = PyUnicode_KIND(unicode);
1176
54.0M
    if (PyUnicode_IS_ASCII(unicode))
1177
46.9M
        struct_size = sizeof(PyASCIIObject);
1178
7.16M
    else
1179
7.16M
        struct_size = sizeof(PyCompactUnicodeObject);
1180
1181
54.0M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1182
0
        PyErr_NoMemory();
1183
0
        return NULL;
1184
0
    }
1185
54.0M
    new_size = (struct_size + (length + 1) * char_size);
1186
1187
54.0M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1188
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1189
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1190
0
        PyUnicode_SET_UTF8(unicode, NULL);
1191
0
    }
1192
#ifdef Py_TRACE_REFS
1193
    _Py_ForgetReference(unicode);
1194
#endif
1195
54.0M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1196
1197
54.0M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1198
54.0M
    if (new_unicode == NULL) {
1199
0
        _Py_NewReferenceNoTotal(unicode);
1200
0
        PyErr_NoMemory();
1201
0
        return NULL;
1202
0
    }
1203
54.0M
    unicode = new_unicode;
1204
54.0M
    _Py_NewReferenceNoTotal(unicode);
1205
1206
54.0M
    _PyUnicode_LENGTH(unicode) = length;
1207
#ifdef Py_DEBUG
1208
    unicode_fill_invalid(unicode, old_length);
1209
#endif
1210
54.0M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1211
54.0M
                    length, 0);
1212
54.0M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1213
54.0M
    return unicode;
1214
54.0M
}
1215
1216
static int
1217
resize_inplace(PyObject *unicode, Py_ssize_t length)
1218
0
{
1219
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1220
0
    assert(Py_REFCNT(unicode) == 1);
1221
1222
0
    Py_ssize_t new_size;
1223
0
    Py_ssize_t char_size;
1224
0
    int share_utf8;
1225
0
    void *data;
1226
#ifdef Py_DEBUG
1227
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1228
#endif
1229
1230
0
    data = _PyUnicode_DATA_ANY(unicode);
1231
0
    char_size = PyUnicode_KIND(unicode);
1232
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1233
1234
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1235
0
        PyErr_NoMemory();
1236
0
        return -1;
1237
0
    }
1238
0
    new_size = (length + 1) * char_size;
1239
1240
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1241
0
    {
1242
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1243
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1244
0
        PyUnicode_SET_UTF8(unicode, NULL);
1245
0
    }
1246
1247
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1248
0
    if (data == NULL) {
1249
0
        PyErr_NoMemory();
1250
0
        return -1;
1251
0
    }
1252
0
    _PyUnicode_DATA_ANY(unicode) = data;
1253
0
    if (share_utf8) {
1254
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1255
0
        PyUnicode_SET_UTF8(unicode, data);
1256
0
    }
1257
0
    _PyUnicode_LENGTH(unicode) = length;
1258
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1259
#ifdef Py_DEBUG
1260
    unicode_fill_invalid(unicode, old_length);
1261
#endif
1262
1263
    /* check for integer overflow */
1264
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1265
0
        PyErr_NoMemory();
1266
0
        return -1;
1267
0
    }
1268
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1269
0
    return 0;
1270
0
}
1271
1272
static const char*
1273
unicode_kind_name(PyObject *unicode)
1274
0
{
1275
    /* don't check consistency: unicode_kind_name() is called from
1276
       _PyUnicode_Dump() */
1277
0
    if (!PyUnicode_IS_COMPACT(unicode))
1278
0
    {
1279
0
        switch (PyUnicode_KIND(unicode))
1280
0
        {
1281
0
        case PyUnicode_1BYTE_KIND:
1282
0
            if (PyUnicode_IS_ASCII(unicode))
1283
0
                return "legacy ascii";
1284
0
            else
1285
0
                return "legacy latin1";
1286
0
        case PyUnicode_2BYTE_KIND:
1287
0
            return "legacy UCS2";
1288
0
        case PyUnicode_4BYTE_KIND:
1289
0
            return "legacy UCS4";
1290
0
        default:
1291
0
            return "<legacy invalid kind>";
1292
0
        }
1293
0
    }
1294
0
    switch (PyUnicode_KIND(unicode)) {
1295
0
    case PyUnicode_1BYTE_KIND:
1296
0
        if (PyUnicode_IS_ASCII(unicode))
1297
0
            return "ascii";
1298
0
        else
1299
0
            return "latin1";
1300
0
    case PyUnicode_2BYTE_KIND:
1301
0
        return "UCS2";
1302
0
    case PyUnicode_4BYTE_KIND:
1303
0
        return "UCS4";
1304
0
    default:
1305
0
        return "<invalid compact kind>";
1306
0
    }
1307
0
}
1308
1309
#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Return PyUnicode_UTF8() of the object at `unicode_raw`. */
const char *_PyUnicode_utf8(void *unicode_raw){
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    return PyUnicode_UTF8(unicode);
}

/* Return _PyUnicode_COMPACT_DATA() of the object at `unicode_raw`. */
const void *_PyUnicode_compact_data(void *unicode_raw) {
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the layout-related addresses and flags of the object at
   `unicode_raw` to stdout, then return PyUnicode_DATA() for it. */
const void *_PyUnicode_data(void *unicode_raw) {
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    printf("obj %p\n", (void*)unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of `op` to stdout: kind name, length, the UTF-8
   pointer and length (non-ASCII strings only) and the data pointer. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
    const void *data;

    /* Compact strings store their characters right after the header;
       which header depends on the ascii flag. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    /* The length fields are read and written as Py_ssize_t elsewhere in
       this file, i.e. they are signed: print them with %zd, not %zu. */
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if (!ascii->state.ascii) {
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
1356
1357
1358
PyObject *
1359
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1360
442M
{
1361
    /* Optimization for empty strings */
1362
442M
    if (size == 0) {
1363
24.4M
        return unicode_get_empty();
1364
24.4M
    }
1365
1366
417M
    PyObject *obj;
1367
417M
    PyCompactUnicodeObject *unicode;
1368
417M
    void *data;
1369
417M
    int kind;
1370
417M
    int is_ascii;
1371
417M
    Py_ssize_t char_size;
1372
417M
    Py_ssize_t struct_size;
1373
1374
417M
    is_ascii = 0;
1375
417M
    struct_size = sizeof(PyCompactUnicodeObject);
1376
417M
    if (maxchar < 128) {
1377
254M
        kind = PyUnicode_1BYTE_KIND;
1378
254M
        char_size = 1;
1379
254M
        is_ascii = 1;
1380
254M
        struct_size = sizeof(PyASCIIObject);
1381
254M
    }
1382
163M
    else if (maxchar < 256) {
1383
19.5M
        kind = PyUnicode_1BYTE_KIND;
1384
19.5M
        char_size = 1;
1385
19.5M
    }
1386
143M
    else if (maxchar < 65536) {
1387
138M
        kind = PyUnicode_2BYTE_KIND;
1388
138M
        char_size = 2;
1389
138M
    }
1390
5.00M
    else {
1391
5.00M
        if (maxchar > MAX_UNICODE) {
1392
0
            PyErr_SetString(PyExc_SystemError,
1393
0
                            "invalid maximum character passed to PyUnicode_New");
1394
0
            return NULL;
1395
0
        }
1396
5.00M
        kind = PyUnicode_4BYTE_KIND;
1397
5.00M
        char_size = 4;
1398
5.00M
    }
1399
1400
    /* Ensure we won't overflow the size. */
1401
417M
    if (size < 0) {
1402
0
        PyErr_SetString(PyExc_SystemError,
1403
0
                        "Negative size passed to PyUnicode_New");
1404
0
        return NULL;
1405
0
    }
1406
417M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1407
0
        return PyErr_NoMemory();
1408
1409
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1410
     * PyObject_New() so we are able to allocate space for the object and
1411
     * it's data buffer.
1412
     */
1413
417M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1414
417M
    if (obj == NULL) {
1415
0
        return PyErr_NoMemory();
1416
0
    }
1417
417M
    _PyObject_Init(obj, &PyUnicode_Type);
1418
1419
417M
    unicode = (PyCompactUnicodeObject *)obj;
1420
417M
    if (is_ascii)
1421
254M
        data = ((PyASCIIObject*)obj) + 1;
1422
163M
    else
1423
163M
        data = unicode + 1;
1424
417M
    _PyUnicode_LENGTH(unicode) = size;
1425
417M
    _PyUnicode_HASH(unicode) = -1;
1426
417M
    _PyUnicode_STATE(unicode).interned = 0;
1427
417M
    _PyUnicode_STATE(unicode).kind = kind;
1428
417M
    _PyUnicode_STATE(unicode).compact = 1;
1429
417M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1430
417M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1431
417M
    if (is_ascii) {
1432
254M
        ((char*)data)[size] = 0;
1433
254M
    }
1434
163M
    else if (kind == PyUnicode_1BYTE_KIND) {
1435
19.5M
        ((char*)data)[size] = 0;
1436
19.5M
        unicode->utf8 = NULL;
1437
19.5M
        unicode->utf8_length = 0;
1438
19.5M
    }
1439
143M
    else {
1440
143M
        unicode->utf8 = NULL;
1441
143M
        unicode->utf8_length = 0;
1442
143M
        if (kind == PyUnicode_2BYTE_KIND)
1443
138M
            ((Py_UCS2*)data)[size] = 0;
1444
5.00M
        else /* kind == PyUnicode_4BYTE_KIND */
1445
5.00M
            ((Py_UCS4*)data)[size] = 0;
1446
143M
    }
1447
#ifdef Py_DEBUG
1448
    unicode_fill_invalid((PyObject*)unicode, 0);
1449
#endif
1450
417M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1451
417M
    return obj;
1452
417M
}
1453
1454
static int
1455
unicode_check_modifiable(PyObject *unicode)
1456
672
{
1457
672
    if (!unicode_modifiable(unicode)) {
1458
0
        PyErr_SetString(PyExc_SystemError,
1459
0
                        "Cannot modify a string currently used");
1460
0
        return -1;
1461
0
    }
1462
672
    return 0;
1463
672
}
1464
1465
static int
1466
_copy_characters(PyObject *to, Py_ssize_t to_start,
1467
                 PyObject *from, Py_ssize_t from_start,
1468
                 Py_ssize_t how_many, int check_maxchar)
1469
249M
{
1470
249M
    int from_kind, to_kind;
1471
249M
    const void *from_data;
1472
249M
    void *to_data;
1473
1474
249M
    assert(0 <= how_many);
1475
249M
    assert(0 <= from_start);
1476
249M
    assert(0 <= to_start);
1477
249M
    assert(PyUnicode_Check(from));
1478
249M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1479
1480
249M
    assert(to == NULL || PyUnicode_Check(to));
1481
1482
249M
    if (how_many == 0) {
1483
310k
        return 0;
1484
310k
    }
1485
1486
249M
    assert(to != NULL);
1487
249M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1488
1489
249M
    from_kind = PyUnicode_KIND(from);
1490
249M
    from_data = PyUnicode_DATA(from);
1491
249M
    to_kind = PyUnicode_KIND(to);
1492
249M
    to_data = PyUnicode_DATA(to);
1493
1494
#ifdef Py_DEBUG
1495
    if (!check_maxchar
1496
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1497
    {
1498
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1499
        Py_UCS4 ch;
1500
        Py_ssize_t i;
1501
        for (i=0; i < how_many; i++) {
1502
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1503
            assert(ch <= to_maxchar);
1504
        }
1505
    }
1506
#endif
1507
1508
249M
    if (from_kind == to_kind) {
1509
153M
        if (check_maxchar
1510
153M
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1511
0
        {
1512
            /* Writing Latin-1 characters into an ASCII string requires to
1513
               check that all written characters are pure ASCII */
1514
0
            Py_UCS4 max_char;
1515
0
            max_char = ucs1lib_find_max_char(from_data,
1516
0
                                             (const Py_UCS1*)from_data + how_many);
1517
0
            if (max_char >= 128)
1518
0
                return -1;
1519
0
        }
1520
153M
        memcpy((char*)to_data + to_kind * to_start,
1521
153M
                  (const char*)from_data + from_kind * from_start,
1522
153M
                  to_kind * how_many);
1523
153M
    }
1524
96.0M
    else if (from_kind == PyUnicode_1BYTE_KIND
1525
96.0M
             && to_kind == PyUnicode_2BYTE_KIND)
1526
80.1M
    {
1527
80.1M
        _PyUnicode_CONVERT_BYTES(
1528
80.1M
            Py_UCS1, Py_UCS2,
1529
80.1M
            PyUnicode_1BYTE_DATA(from) + from_start,
1530
80.1M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1531
80.1M
            PyUnicode_2BYTE_DATA(to) + to_start
1532
80.1M
            );
1533
80.1M
    }
1534
15.9M
    else if (from_kind == PyUnicode_1BYTE_KIND
1535
15.9M
             && to_kind == PyUnicode_4BYTE_KIND)
1536
13.8M
    {
1537
13.8M
        _PyUnicode_CONVERT_BYTES(
1538
13.8M
            Py_UCS1, Py_UCS4,
1539
13.8M
            PyUnicode_1BYTE_DATA(from) + from_start,
1540
13.8M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1541
13.8M
            PyUnicode_4BYTE_DATA(to) + to_start
1542
13.8M
            );
1543
13.8M
    }
1544
2.06M
    else if (from_kind == PyUnicode_2BYTE_KIND
1545
2.06M
             && to_kind == PyUnicode_4BYTE_KIND)
1546
2.03M
    {
1547
2.03M
        _PyUnicode_CONVERT_BYTES(
1548
2.03M
            Py_UCS2, Py_UCS4,
1549
2.03M
            PyUnicode_2BYTE_DATA(from) + from_start,
1550
2.03M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1551
2.03M
            PyUnicode_4BYTE_DATA(to) + to_start
1552
2.03M
            );
1553
2.03M
    }
1554
27.0k
    else {
1555
27.0k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1556
1557
27.0k
        if (!check_maxchar) {
1558
27.0k
            if (from_kind == PyUnicode_2BYTE_KIND
1559
27.0k
                && to_kind == PyUnicode_1BYTE_KIND)
1560
1.90k
            {
1561
1.90k
                _PyUnicode_CONVERT_BYTES(
1562
1.90k
                    Py_UCS2, Py_UCS1,
1563
1.90k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1564
1.90k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1565
1.90k
                    PyUnicode_1BYTE_DATA(to) + to_start
1566
1.90k
                    );
1567
1.90k
            }
1568
25.1k
            else if (from_kind == PyUnicode_4BYTE_KIND
1569
25.1k
                     && to_kind == PyUnicode_1BYTE_KIND)
1570
9.14k
            {
1571
9.14k
                _PyUnicode_CONVERT_BYTES(
1572
9.14k
                    Py_UCS4, Py_UCS1,
1573
9.14k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1574
9.14k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1575
9.14k
                    PyUnicode_1BYTE_DATA(to) + to_start
1576
9.14k
                    );
1577
9.14k
            }
1578
16.0k
            else if (from_kind == PyUnicode_4BYTE_KIND
1579
16.0k
                     && to_kind == PyUnicode_2BYTE_KIND)
1580
16.0k
            {
1581
16.0k
                _PyUnicode_CONVERT_BYTES(
1582
16.0k
                    Py_UCS4, Py_UCS2,
1583
16.0k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1584
16.0k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1585
16.0k
                    PyUnicode_2BYTE_DATA(to) + to_start
1586
16.0k
                    );
1587
16.0k
            }
1588
0
            else {
1589
0
                Py_UNREACHABLE();
1590
0
            }
1591
27.0k
        }
1592
0
        else {
1593
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1594
0
            Py_UCS4 ch;
1595
0
            Py_ssize_t i;
1596
1597
0
            for (i=0; i < how_many; i++) {
1598
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1599
0
                if (ch > to_maxchar)
1600
0
                    return -1;
1601
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1602
0
            }
1603
0
        }
1604
27.0k
    }
1605
249M
    return 0;
1606
249M
}
1607
1608
void
1609
_PyUnicode_FastCopyCharacters(
1610
    PyObject *to, Py_ssize_t to_start,
1611
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1612
249M
{
1613
249M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1614
249M
}
1615
1616
Py_ssize_t
1617
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1618
                         PyObject *from, Py_ssize_t from_start,
1619
                         Py_ssize_t how_many)
1620
0
{
1621
0
    int err;
1622
1623
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1624
0
        PyErr_BadInternalCall();
1625
0
        return -1;
1626
0
    }
1627
1628
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1629
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1630
0
        return -1;
1631
0
    }
1632
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1633
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1634
0
        return -1;
1635
0
    }
1636
0
    if (how_many < 0) {
1637
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1638
0
        return -1;
1639
0
    }
1640
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1641
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1642
0
        PyErr_Format(PyExc_SystemError,
1643
0
                     "Cannot write %zi characters at %zi "
1644
0
                     "in a string of %zi characters",
1645
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1646
0
        return -1;
1647
0
    }
1648
1649
0
    if (how_many == 0)
1650
0
        return 0;
1651
1652
0
    if (unicode_check_modifiable(to))
1653
0
        return -1;
1654
1655
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1656
0
    if (err) {
1657
0
        PyErr_Format(PyExc_SystemError,
1658
0
                     "Cannot copy %s characters "
1659
0
                     "into a string of %s characters",
1660
0
                     unicode_kind_name(from),
1661
0
                     unicode_kind_name(to));
1662
0
        return -1;
1663
0
    }
1664
0
    return how_many;
1665
0
}
1666
1667
/* Find the maximum code point and count the number of surrogate pairs so a
1668
   correct string length can be computed before converting a string to UCS4.
1669
   This function counts single surrogates as a character and not as a pair.
1670
1671
   Return 0 on success, or -1 on error. */
1672
static int
1673
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1674
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1675
17.9k
{
1676
17.9k
    const wchar_t *iter;
1677
17.9k
    Py_UCS4 ch;
1678
1679
17.9k
    assert(num_surrogates != NULL && maxchar != NULL);
1680
17.9k
    *num_surrogates = 0;
1681
17.9k
    *maxchar = 0;
1682
1683
401k
    for (iter = begin; iter < end; ) {
1684
#if SIZEOF_WCHAR_T == 2
1685
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1686
            && (iter+1) < end
1687
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1688
        {
1689
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1690
            ++(*num_surrogates);
1691
            iter += 2;
1692
        }
1693
        else
1694
#endif
1695
383k
        {
1696
383k
            ch = *iter;
1697
383k
            iter++;
1698
383k
        }
1699
383k
        if (ch > *maxchar) {
1700
78.4k
            *maxchar = ch;
1701
78.4k
            if (*maxchar > MAX_UNICODE) {
1702
0
                PyErr_Format(PyExc_ValueError,
1703
0
                             "character U+%x is not in range [U+0000; U+%x]",
1704
0
                             ch, MAX_UNICODE);
1705
0
                return -1;
1706
0
            }
1707
78.4k
        }
1708
383k
    }
1709
17.9k
    return 0;
1710
17.9k
}
1711
1712
static void
1713
unicode_dealloc(PyObject *unicode)
1714
427M
{
1715
#ifdef Py_DEBUG
1716
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1717
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1718
    }
1719
#endif
1720
427M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1721
        /* This should never get called, but we also don't want to SEGV if
1722
        * we accidentally decref an immortal string out of existence. Since
1723
        * the string is an immortal object, just re-set the reference count.
1724
        */
1725
#ifdef Py_DEBUG
1726
        Py_UNREACHABLE();
1727
#endif
1728
0
        _Py_SetImmortal(unicode);
1729
0
        return;
1730
0
    }
1731
427M
    switch (_PyUnicode_STATE(unicode).interned) {
1732
427M
        case SSTATE_NOT_INTERNED:
1733
427M
            break;
1734
456k
        case SSTATE_INTERNED_MORTAL:
1735
            /* Remove the object from the intern dict.
1736
             * Before doing so, we set the refcount to 2: the key and value
1737
             * in the interned_dict.
1738
             */
1739
456k
            assert(Py_REFCNT(unicode) == 0);
1740
456k
            Py_SET_REFCNT(unicode, 2);
1741
#ifdef Py_REF_DEBUG
1742
            /* let's be pedantic with the ref total */
1743
            _Py_IncRefTotal(_PyThreadState_GET());
1744
            _Py_IncRefTotal(_PyThreadState_GET());
1745
#endif
1746
456k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1747
456k
            PyObject *interned = get_interned_dict(interp);
1748
456k
            assert(interned != NULL);
1749
456k
            PyObject *popped;
1750
456k
            int r = PyDict_Pop(interned, unicode, &popped);
1751
456k
            if (r == -1) {
1752
0
                PyErr_FormatUnraisable("Exception ignored while "
1753
0
                                       "removing an interned string %R",
1754
0
                                       unicode);
1755
                // We don't know what happened to the string. It's probably
1756
                // best to leak it:
1757
                // - if it was popped, there are no more references to it
1758
                //   so it can't cause trouble (except wasted memory)
1759
                // - if it wasn't popped, it'll remain interned
1760
0
                _Py_SetImmortal(unicode);
1761
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1762
0
                return;
1763
0
            }
1764
456k
            if (r == 0) {
1765
                // The interned string was not found in the interned_dict.
1766
#ifdef Py_DEBUG
1767
                Py_UNREACHABLE();
1768
#endif
1769
0
                _Py_SetImmortal(unicode);
1770
0
                return;
1771
0
            }
1772
            // Successfully popped.
1773
456k
            assert(popped == unicode);
1774
            // Only our `popped` reference should be left; remove it too.
1775
456k
            assert(Py_REFCNT(unicode) == 1);
1776
456k
            Py_SET_REFCNT(unicode, 0);
1777
#ifdef Py_REF_DEBUG
1778
            /* let's be pedantic with the ref total */
1779
            _Py_DecRefTotal(_PyThreadState_GET());
1780
#endif
1781
456k
            break;
1782
0
        default:
1783
            // As with `statically_allocated` above.
1784
#ifdef Py_REF_DEBUG
1785
            Py_UNREACHABLE();
1786
#endif
1787
0
            _Py_SetImmortal(unicode);
1788
0
            return;
1789
427M
    }
1790
427M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1791
121k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1792
121k
    }
1793
427M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1794
9.98M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1795
9.98M
    }
1796
1797
427M
    Py_TYPE(unicode)->tp_free(unicode);
1798
427M
}
1799
1800
#ifdef Py_DEBUG
1801
static int
1802
unicode_is_singleton(PyObject *unicode)
1803
{
1804
    if (unicode == &_Py_STR(empty)) {
1805
        return 1;
1806
    }
1807
1808
    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1809
    if (ascii->length == 1) {
1810
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1811
        if (ch < 256 && LATIN1(ch) == unicode) {
1812
            return 1;
1813
        }
1814
    }
1815
    return 0;
1816
}
1817
#endif
1818
1819
static int
1820
unicode_modifiable(PyObject *unicode)
1821
55.4M
{
1822
55.4M
    assert(_PyUnicode_CHECK(unicode));
1823
55.4M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1824
47.8k
        return 0;
1825
55.3M
    if (PyUnicode_HASH(unicode) != -1)
1826
0
        return 0;
1827
55.3M
    if (PyUnicode_CHECK_INTERNED(unicode))
1828
0
        return 0;
1829
55.3M
    if (!PyUnicode_CheckExact(unicode))
1830
0
        return 0;
1831
#ifdef Py_DEBUG
1832
    /* singleton refcount is greater than 1 */
1833
    assert(!unicode_is_singleton(unicode));
1834
#endif
1835
55.3M
    return 1;
1836
55.3M
}
1837
1838
static int
1839
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1840
652k
{
1841
652k
    PyObject *unicode;
1842
652k
    Py_ssize_t old_length;
1843
1844
652k
    assert(p_unicode != NULL);
1845
652k
    unicode = *p_unicode;
1846
1847
652k
    assert(unicode != NULL);
1848
652k
    assert(PyUnicode_Check(unicode));
1849
652k
    assert(0 <= length);
1850
1851
652k
    old_length = PyUnicode_GET_LENGTH(unicode);
1852
652k
    if (old_length == length)
1853
0
        return 0;
1854
1855
652k
    if (length == 0) {
1856
0
        PyObject *empty = unicode_get_empty();
1857
0
        Py_SETREF(*p_unicode, empty);
1858
0
        return 0;
1859
0
    }
1860
1861
652k
    if (!unicode_modifiable(unicode)) {
1862
0
        PyObject *copy = resize_copy(unicode, length);
1863
0
        if (copy == NULL)
1864
0
            return -1;
1865
0
        Py_SETREF(*p_unicode, copy);
1866
0
        return 0;
1867
0
    }
1868
1869
652k
    if (PyUnicode_IS_COMPACT(unicode)) {
1870
652k
        PyObject *new_unicode = resize_compact(unicode, length);
1871
652k
        if (new_unicode == NULL)
1872
0
            return -1;
1873
652k
        *p_unicode = new_unicode;
1874
652k
        return 0;
1875
652k
    }
1876
0
    return resize_inplace(unicode, length);
1877
652k
}
1878
1879
int
1880
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1881
0
{
1882
0
    PyObject *unicode;
1883
0
    if (p_unicode == NULL) {
1884
0
        PyErr_BadInternalCall();
1885
0
        return -1;
1886
0
    }
1887
0
    unicode = *p_unicode;
1888
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1889
0
    {
1890
0
        PyErr_BadInternalCall();
1891
0
        return -1;
1892
0
    }
1893
0
    return unicode_resize(p_unicode, length);
1894
0
}
1895
1896
/* Copy an ASCII or latin1 char* string into a Python Unicode string.
1897
1898
   WARNING: The function doesn't copy the terminating null character and
1899
   doesn't check the maximum character (may write a latin1 character in an
1900
   ASCII string). */
1901
static void
1902
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1903
                   const char *str, Py_ssize_t len)
1904
0
{
1905
0
    int kind = PyUnicode_KIND(unicode);
1906
0
    const void *data = PyUnicode_DATA(unicode);
1907
0
    const char *end = str + len;
1908
1909
0
    assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1910
0
    switch (kind) {
1911
0
    case PyUnicode_1BYTE_KIND: {
1912
#ifdef Py_DEBUG
1913
        if (PyUnicode_IS_ASCII(unicode)) {
1914
            Py_UCS4 maxchar = ucs1lib_find_max_char(
1915
                (const Py_UCS1*)str,
1916
                (const Py_UCS1*)str + len);
1917
            assert(maxchar < 128);
1918
        }
1919
#endif
1920
0
        memcpy((char *) data + index, str, len);
1921
0
        break;
1922
0
    }
1923
0
    case PyUnicode_2BYTE_KIND: {
1924
0
        Py_UCS2 *start = (Py_UCS2 *)data + index;
1925
0
        Py_UCS2 *ucs2 = start;
1926
1927
0
        for (; str < end; ++ucs2, ++str)
1928
0
            *ucs2 = (Py_UCS2)*str;
1929
1930
0
        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1931
0
        break;
1932
0
    }
1933
0
    case PyUnicode_4BYTE_KIND: {
1934
0
        Py_UCS4 *start = (Py_UCS4 *)data + index;
1935
0
        Py_UCS4 *ucs4 = start;
1936
1937
0
        for (; str < end; ++ucs4, ++str)
1938
0
            *ucs4 = (Py_UCS4)*str;
1939
1940
0
        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1941
0
        break;
1942
0
    }
1943
0
    default:
1944
0
        Py_UNREACHABLE();
1945
0
    }
1946
0
}
1947
1948
static PyObject*
1949
get_latin1_char(Py_UCS1 ch)
1950
221M
{
1951
221M
    PyObject *o = LATIN1(ch);
1952
221M
    return o;
1953
221M
}
1954
1955
static PyObject*
1956
unicode_char(Py_UCS4 ch)
1957
257M
{
1958
257M
    PyObject *unicode;
1959
1960
257M
    assert(ch <= MAX_UNICODE);
1961
1962
257M
    if (ch < 256) {
1963
167M
        return get_latin1_char(ch);
1964
167M
    }
1965
1966
89.9M
    unicode = PyUnicode_New(1, ch);
1967
89.9M
    if (unicode == NULL)
1968
0
        return NULL;
1969
1970
89.9M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1971
89.9M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1972
87.5M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1973
87.5M
    } else {
1974
2.42M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1975
2.42M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1976
2.42M
    }
1977
89.9M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1978
89.9M
    return unicode;
1979
89.9M
}
1980
1981
1982
static inline void
1983
unicode_write_widechar(int kind, void *data,
1984
                       const wchar_t *u, Py_ssize_t size,
1985
                       Py_ssize_t num_surrogates)
1986
17.9k
{
1987
17.9k
    switch (kind) {
1988
17.9k
    case PyUnicode_1BYTE_KIND:
1989
17.9k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1990
17.9k
        break;
1991
1992
0
    case PyUnicode_2BYTE_KIND:
1993
#if SIZEOF_WCHAR_T == 2
1994
        memcpy(data, u, size * 2);
1995
#else
1996
0
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1997
0
#endif
1998
0
        break;
1999
2000
0
    case PyUnicode_4BYTE_KIND:
2001
0
    {
2002
#if SIZEOF_WCHAR_T == 2
2003
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
2004
        // surrogate pairs.
2005
        const wchar_t *end = u + size;
2006
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
2007
#  ifndef NDEBUG
2008
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
2009
#  endif
2010
        for (const wchar_t *iter = u; iter < end; ) {
2011
            assert(ucs4_out < ucs4_end);
2012
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
2013
                && (iter+1) < end
2014
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
2015
            {
2016
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
2017
                iter += 2;
2018
            }
2019
            else {
2020
                *ucs4_out++ = *iter;
2021
                iter++;
2022
            }
2023
        }
2024
        assert(ucs4_out == ucs4_end);
2025
#else
2026
0
        assert(num_surrogates == 0);
2027
0
        memcpy(data, u, size * 4);
2028
0
#endif
2029
0
        break;
2030
0
    }
2031
0
    default:
2032
0
        Py_UNREACHABLE();
2033
17.9k
    }
2034
17.9k
}
2035
2036
2037
PyObject *
2038
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2039
17.9k
{
2040
17.9k
    PyObject *unicode;
2041
17.9k
    Py_UCS4 maxchar = 0;
2042
17.9k
    Py_ssize_t num_surrogates;
2043
2044
17.9k
    if (u == NULL && size != 0) {
2045
0
        PyErr_BadInternalCall();
2046
0
        return NULL;
2047
0
    }
2048
2049
17.9k
    if (size == -1) {
2050
608
        size = wcslen(u);
2051
608
    }
2052
2053
    /* If the Unicode data is known at construction time, we can apply
2054
       some optimizations which share commonly used objects. */
2055
2056
    /* Optimization for empty strings */
2057
17.9k
    if (size == 0)
2058
32
        _Py_RETURN_UNICODE_EMPTY();
2059
2060
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2061
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2062
       non-Unicode locales and hence needs conversion to UCS-4 first. */
2063
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2064
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2065
        if (!converted) {
2066
            return NULL;
2067
        }
2068
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2069
        PyMem_Free(converted);
2070
        return unicode;
2071
    }
2072
#endif
2073
2074
    /* Single character Unicode objects in the Latin-1 range are
2075
       shared when using this constructor */
2076
17.9k
    if (size == 1 && (Py_UCS4)*u < 256)
2077
0
        return get_latin1_char((unsigned char)*u);
2078
2079
    /* If not empty and not single character, copy the Unicode data
2080
       into the new object */
2081
17.9k
    if (find_maxchar_surrogates(u, u + size,
2082
17.9k
                                &maxchar, &num_surrogates) == -1)
2083
0
        return NULL;
2084
2085
17.9k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
2086
17.9k
    if (!unicode)
2087
0
        return NULL;
2088
2089
17.9k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2090
17.9k
                           u, size, num_surrogates);
2091
2092
17.9k
    return unicode_result(unicode);
2093
17.9k
}
2094
2095
2096
int
2097
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
2098
                              const wchar_t *str,
2099
                              Py_ssize_t size)
2100
0
{
2101
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
2102
2103
0
    if (size < 0) {
2104
0
        size = wcslen(str);
2105
0
    }
2106
2107
0
    if (size == 0) {
2108
0
        return 0;
2109
0
    }
2110
2111
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2112
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2113
       non-Unicode locales and hence needs conversion to UCS-4 first. */
2114
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2115
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
2116
        if (!converted) {
2117
            return -1;
2118
        }
2119
2120
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
2121
        PyMem_Free(converted);
2122
        return res;
2123
    }
2124
#endif
2125
2126
0
    Py_UCS4 maxchar = 0;
2127
0
    Py_ssize_t num_surrogates;
2128
0
    if (find_maxchar_surrogates(str, str + size,
2129
0
                                &maxchar, &num_surrogates) == -1) {
2130
0
        return -1;
2131
0
    }
2132
2133
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
2134
0
        return -1;
2135
0
    }
2136
2137
0
    int kind = writer->kind;
2138
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2139
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
2140
2141
0
    writer->pos += size - num_surrogates;
2142
0
    return 0;
2143
0
}
2144
2145
2146
PyObject *
2147
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2148
546k
{
2149
546k
    if (size < 0) {
2150
0
        PyErr_SetString(PyExc_SystemError,
2151
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2152
0
        return NULL;
2153
0
    }
2154
546k
    if (u != NULL) {
2155
546k
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2156
546k
    }
2157
0
    if (size > 0) {
2158
0
        PyErr_SetString(PyExc_SystemError,
2159
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2160
0
        return NULL;
2161
0
    }
2162
0
    return unicode_get_empty();
2163
0
}
2164
2165
PyObject *
2166
PyUnicode_FromString(const char *u)
2167
5.77M
{
2168
5.77M
    size_t size = strlen(u);
2169
5.77M
    if (size > PY_SSIZE_T_MAX) {
2170
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2171
0
        return NULL;
2172
0
    }
2173
5.77M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2174
5.77M
}
2175
2176
2177
PyObject *
2178
_PyUnicode_FromId(_Py_Identifier *id)
2179
0
{
2180
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2181
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2182
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2183
2184
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2185
0
    if (index < 0) {
2186
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2187
2188
0
        PyMutex_Lock(&rt_ids->mutex);
2189
        // Check again to detect concurrent access. Another thread can have
2190
        // initialized the index while this thread waited for the lock.
2191
0
        index = _Py_atomic_load_ssize(&id->index);
2192
0
        if (index < 0) {
2193
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2194
0
            index = rt_ids->next_index;
2195
0
            rt_ids->next_index++;
2196
0
            _Py_atomic_store_ssize(&id->index, index);
2197
0
        }
2198
0
        PyMutex_Unlock(&rt_ids->mutex);
2199
0
    }
2200
0
    assert(index >= 0);
2201
2202
0
    PyObject *obj;
2203
0
    if (index < ids->size) {
2204
0
        obj = ids->array[index];
2205
0
        if (obj) {
2206
            // Return a borrowed reference
2207
0
            goto end;
2208
0
        }
2209
0
    }
2210
2211
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2212
0
                                       NULL, NULL);
2213
0
    if (!obj) {
2214
0
        goto end;
2215
0
    }
2216
0
    _PyUnicode_InternImmortal(interp, &obj);
2217
2218
0
    if (index >= ids->size) {
2219
        // Overallocate to reduce the number of realloc
2220
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2221
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2222
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2223
0
        if (new_array == NULL) {
2224
0
            PyErr_NoMemory();
2225
0
            obj = NULL;
2226
0
            goto end;
2227
0
        }
2228
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2229
0
        ids->array = new_array;
2230
0
        ids->size = new_size;
2231
0
    }
2232
2233
    // The array stores a strong reference
2234
0
    ids->array[index] = obj;
2235
2236
0
end:
2237
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2238
    // Return a borrowed reference
2239
0
    return obj;
2240
0
}
2241
2242
2243
static void
2244
unicode_clear_identifiers(struct _Py_unicode_state *state)
2245
0
{
2246
0
    struct _Py_unicode_ids *ids = &state->ids;
2247
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2248
0
        Py_XDECREF(ids->array[i]);
2249
0
    }
2250
0
    ids->size = 0;
2251
0
    PyMem_Free(ids->array);
2252
0
    ids->array = NULL;
2253
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2254
    // after Py_Finalize().
2255
0
}
2256
2257
2258
/* Internal function, doesn't check maximum character */
2259
2260
PyObject*
2261
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2262
84.7M
{
2263
84.7M
    const unsigned char *s = (const unsigned char *)buffer;
2264
84.7M
    PyObject *unicode;
2265
84.7M
    if (size == 1) {
2266
#ifdef Py_DEBUG
2267
        assert((unsigned char)s[0] < 128);
2268
#endif
2269
30.0M
        return get_latin1_char(s[0]);
2270
30.0M
    }
2271
54.6M
    unicode = PyUnicode_New(size, 127);
2272
54.6M
    if (!unicode)
2273
0
        return NULL;
2274
54.6M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2275
54.6M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2276
54.6M
    return unicode;
2277
54.6M
}
2278
2279
static Py_UCS4
2280
kind_maxchar_limit(int kind)
2281
0
{
2282
0
    switch (kind) {
2283
0
    case PyUnicode_1BYTE_KIND:
2284
0
        return 0x80;
2285
0
    case PyUnicode_2BYTE_KIND:
2286
0
        return 0x100;
2287
0
    case PyUnicode_4BYTE_KIND:
2288
0
        return 0x10000;
2289
0
    default:
2290
0
        Py_UNREACHABLE();
2291
0
    }
2292
0
}
2293
2294
static PyObject*
2295
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2296
57.0M
{
2297
57.0M
    PyObject *res;
2298
57.0M
    unsigned char max_char;
2299
2300
57.0M
    if (size == 0) {
2301
5.36M
        _Py_RETURN_UNICODE_EMPTY();
2302
5.36M
    }
2303
51.6M
    assert(size > 0);
2304
51.6M
    if (size == 1) {
2305
21.5M
        return get_latin1_char(u[0]);
2306
21.5M
    }
2307
2308
30.0M
    max_char = ucs1lib_find_max_char(u, u + size);
2309
30.0M
    res = PyUnicode_New(size, max_char);
2310
30.0M
    if (!res)
2311
0
        return NULL;
2312
30.0M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2313
30.0M
    assert(_PyUnicode_CheckConsistency(res, 1));
2314
30.0M
    return res;
2315
30.0M
}
2316
2317
static PyObject*
2318
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2319
95.0M
{
2320
95.0M
    PyObject *res;
2321
95.0M
    Py_UCS2 max_char;
2322
2323
95.0M
    if (size == 0)
2324
11.2M
        _Py_RETURN_UNICODE_EMPTY();
2325
83.7M
    assert(size > 0);
2326
83.7M
    if (size == 1)
2327
55.6M
        return unicode_char(u[0]);
2328
2329
28.0M
    max_char = ucs2lib_find_max_char(u, u + size);
2330
28.0M
    res = PyUnicode_New(size, max_char);
2331
28.0M
    if (!res)
2332
0
        return NULL;
2333
28.0M
    if (max_char >= 256)
2334
15.8M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2335
12.2M
    else {
2336
12.2M
        _PyUnicode_CONVERT_BYTES(
2337
12.2M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2338
12.2M
    }
2339
28.0M
    assert(_PyUnicode_CheckConsistency(res, 1));
2340
28.0M
    return res;
2341
28.0M
}
2342
2343
static PyObject*
2344
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2345
76.8M
{
2346
76.8M
    PyObject *res;
2347
76.8M
    Py_UCS4 max_char;
2348
2349
76.8M
    if (size == 0)
2350
8.42M
        _Py_RETURN_UNICODE_EMPTY();
2351
68.4M
    assert(size > 0);
2352
68.4M
    if (size == 1)
2353
48.5M
        return unicode_char(u[0]);
2354
2355
19.8M
    max_char = ucs4lib_find_max_char(u, u + size);
2356
19.8M
    res = PyUnicode_New(size, max_char);
2357
19.8M
    if (!res)
2358
0
        return NULL;
2359
19.8M
    if (max_char < 256)
2360
14.4M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2361
19.8M
                                 PyUnicode_1BYTE_DATA(res));
2362
5.37M
    else if (max_char < 0x10000)
2363
3.55M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2364
5.37M
                                 PyUnicode_2BYTE_DATA(res));
2365
1.81M
    else
2366
1.81M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2367
19.8M
    assert(_PyUnicode_CheckConsistency(res, 1));
2368
19.8M
    return res;
2369
19.8M
}
2370
2371
2372
int
2373
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2374
                          Py_UCS4 *str,
2375
                          Py_ssize_t size)
2376
0
{
2377
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2378
2379
0
    if (size < 0) {
2380
0
        PyErr_SetString(PyExc_ValueError,
2381
0
                        "size must be positive");
2382
0
        return -1;
2383
0
    }
2384
2385
0
    if (size == 0) {
2386
0
        return 0;
2387
0
    }
2388
2389
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2390
2391
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2392
0
        return -1;
2393
0
    }
2394
2395
0
    int kind = writer->kind;
2396
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2397
0
    if (kind == PyUnicode_1BYTE_KIND) {
2398
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2399
0
                                 str, str + size,
2400
0
                                 data);
2401
0
    }
2402
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2403
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2404
0
                                 str, str + size,
2405
0
                                 data);
2406
0
    }
2407
0
    else {
2408
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2409
0
    }
2410
0
    writer->pos += size;
2411
2412
0
    return 0;
2413
0
}
2414
2415
2416
PyObject*
2417
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2418
181M
{
2419
181M
    if (size < 0) {
2420
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2421
0
        return NULL;
2422
0
    }
2423
181M
    switch (kind) {
2424
37.6M
    case PyUnicode_1BYTE_KIND:
2425
37.6M
        return _PyUnicode_FromUCS1(buffer, size);
2426
80.9M
    case PyUnicode_2BYTE_KIND:
2427
80.9M
        return _PyUnicode_FromUCS2(buffer, size);
2428
63.1M
    case PyUnicode_4BYTE_KIND:
2429
63.1M
        return _PyUnicode_FromUCS4(buffer, size);
2430
0
    default:
2431
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2432
0
        return NULL;
2433
181M
    }
2434
181M
}
2435
2436
Py_UCS4
2437
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2438
12.7M
{
2439
12.7M
    int kind;
2440
12.7M
    const void *startptr, *endptr;
2441
2442
12.7M
    assert(0 <= start);
2443
12.7M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2444
12.7M
    assert(start <= end);
2445
2446
12.7M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2447
0
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2448
2449
12.7M
    if (start == end)
2450
0
        return 127;
2451
2452
12.7M
    if (PyUnicode_IS_ASCII(unicode))
2453
12.6M
        return 127;
2454
2455
36.6k
    kind = PyUnicode_KIND(unicode);
2456
36.6k
    startptr = PyUnicode_DATA(unicode);
2457
36.6k
    endptr = (char *)startptr + end * kind;
2458
36.6k
    startptr = (char *)startptr + start * kind;
2459
36.6k
    switch(kind) {
2460
1.75k
    case PyUnicode_1BYTE_KIND:
2461
1.75k
        return ucs1lib_find_max_char(startptr, endptr);
2462
4.28k
    case PyUnicode_2BYTE_KIND:
2463
4.28k
        return ucs2lib_find_max_char(startptr, endptr);
2464
30.6k
    case PyUnicode_4BYTE_KIND:
2465
30.6k
        return ucs4lib_find_max_char(startptr, endptr);
2466
0
    default:
2467
0
        Py_UNREACHABLE();
2468
36.6k
    }
2469
36.6k
}
2470
2471
/* Ensure that a string uses the most efficient storage, if it is not the
2472
   case: create a new string with of the right kind. Write NULL into *p_unicode
2473
   on error. */
2474
static void
2475
unicode_adjust_maxchar(PyObject **p_unicode)
2476
0
{
2477
0
    PyObject *unicode, *copy;
2478
0
    Py_UCS4 max_char;
2479
0
    Py_ssize_t len;
2480
0
    int kind;
2481
2482
0
    assert(p_unicode != NULL);
2483
0
    unicode = *p_unicode;
2484
0
    if (PyUnicode_IS_ASCII(unicode))
2485
0
        return;
2486
2487
0
    len = PyUnicode_GET_LENGTH(unicode);
2488
0
    kind = PyUnicode_KIND(unicode);
2489
0
    if (kind == PyUnicode_1BYTE_KIND) {
2490
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2491
0
        max_char = ucs1lib_find_max_char(u, u + len);
2492
0
        if (max_char >= 128)
2493
0
            return;
2494
0
    }
2495
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2496
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2497
0
        max_char = ucs2lib_find_max_char(u, u + len);
2498
0
        if (max_char >= 256)
2499
0
            return;
2500
0
    }
2501
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2502
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2503
0
        max_char = ucs4lib_find_max_char(u, u + len);
2504
0
        if (max_char >= 0x10000)
2505
0
            return;
2506
0
    }
2507
0
    else
2508
0
        Py_UNREACHABLE();
2509
2510
0
    copy = PyUnicode_New(len, max_char);
2511
0
    if (copy != NULL)
2512
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2513
0
    Py_DECREF(unicode);
2514
0
    *p_unicode = copy;
2515
0
}
2516
2517
/* Return a new string with the same length, maximum character value and
   character data as `unicode`.  Report a bad internal call if `unicode` is
   not a str object; return NULL on any failure. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *duplicate = PyUnicode_New(nchars,
                                        PyUnicode_MAX_CHAR_VALUE(unicode));
    if (duplicate == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(duplicate) == PyUnicode_KIND(unicode));

    /* The kind is used as the byte width of one character. */
    memcpy(PyUnicode_DATA(duplicate), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(duplicate, 1));
    return duplicate;
}
2539
2540
2541
/* Widen Unicode objects to larger buffers. Don't write terminating null
2542
   character. Return NULL on error. */
2543
2544
static void*
2545
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2546
8.95M
{
2547
8.95M
    void *result;
2548
2549
8.95M
    assert(skind < kind);
2550
8.95M
    switch (kind) {
2551
8.03M
    case PyUnicode_2BYTE_KIND:
2552
8.03M
        result = PyMem_New(Py_UCS2, len);
2553
8.03M
        if (!result)
2554
0
            return PyErr_NoMemory();
2555
8.03M
        assert(skind == PyUnicode_1BYTE_KIND);
2556
8.03M
        _PyUnicode_CONVERT_BYTES(
2557
8.03M
            Py_UCS1, Py_UCS2,
2558
8.03M
            (const Py_UCS1 *)data,
2559
8.03M
            ((const Py_UCS1 *)data) + len,
2560
8.03M
            result);
2561
8.03M
        return result;
2562
923k
    case PyUnicode_4BYTE_KIND:
2563
923k
        result = PyMem_New(Py_UCS4, len);
2564
923k
        if (!result)
2565
0
            return PyErr_NoMemory();
2566
923k
        if (skind == PyUnicode_2BYTE_KIND) {
2567
0
            _PyUnicode_CONVERT_BYTES(
2568
0
                Py_UCS2, Py_UCS4,
2569
0
                (const Py_UCS2 *)data,
2570
0
                ((const Py_UCS2 *)data) + len,
2571
0
                result);
2572
0
        }
2573
923k
        else {
2574
923k
            assert(skind == PyUnicode_1BYTE_KIND);
2575
923k
            _PyUnicode_CONVERT_BYTES(
2576
923k
                Py_UCS1, Py_UCS4,
2577
923k
                (const Py_UCS1 *)data,
2578
923k
                ((const Py_UCS1 *)data) + len,
2579
923k
                result);
2580
923k
        }
2581
923k
        return result;
2582
0
    default:
2583
0
        Py_UNREACHABLE();
2584
0
        return NULL;
2585
8.95M
    }
2586
8.95M
}
2587
2588
static Py_UCS4*
2589
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2590
        int copy_null)
2591
66.6k
{
2592
66.6k
    int kind;
2593
66.6k
    const void *data;
2594
66.6k
    Py_ssize_t len, targetlen;
2595
66.6k
    kind = PyUnicode_KIND(string);
2596
66.6k
    data = PyUnicode_DATA(string);
2597
66.6k
    len = PyUnicode_GET_LENGTH(string);
2598
66.6k
    targetlen = len;
2599
66.6k
    if (copy_null)
2600
0
        targetlen++;
2601
66.6k
    if (!target) {
2602
0
        target = PyMem_New(Py_UCS4, targetlen);
2603
0
        if (!target) {
2604
0
            PyErr_NoMemory();
2605
0
            return NULL;
2606
0
        }
2607
0
    }
2608
66.6k
    else {
2609
66.6k
        if (targetsize < targetlen) {
2610
0
            PyErr_Format(PyExc_SystemError,
2611
0
                         "string is longer than the buffer");
2612
0
            if (copy_null && 0 < targetsize)
2613
0
                target[0] = 0;
2614
0
            return NULL;
2615
0
        }
2616
66.6k
    }
2617
66.6k
    if (kind == PyUnicode_1BYTE_KIND) {
2618
44.9k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2619
44.9k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2620
44.9k
    }
2621
21.7k
    else if (kind == PyUnicode_2BYTE_KIND) {
2622
16.3k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2623
16.3k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2624
16.3k
    }
2625
5.35k
    else if (kind == PyUnicode_4BYTE_KIND) {
2626
5.35k
        memcpy(target, data, len * sizeof(Py_UCS4));
2627
5.35k
    }
2628
0
    else {
2629
0
        Py_UNREACHABLE();
2630
0
    }
2631
66.6k
    if (copy_null)
2632
0
        target[len] = 0;
2633
66.6k
    return target;
2634
66.6k
}
2635
2636
Py_UCS4*
2637
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2638
                 int copy_null)
2639
66.6k
{
2640
66.6k
    if (target == NULL || targetsize < 0) {
2641
0
        PyErr_BadInternalCall();
2642
0
        return NULL;
2643
0
    }
2644
66.6k
    return as_ucs4(string, target, targetsize, copy_null);
2645
66.6k
}
2646
2647
Py_UCS4*
2648
PyUnicode_AsUCS4Copy(PyObject *string)
2649
0
{
2650
0
    return as_ucs4(string, NULL, 0, 1);
2651
0
}
2652
2653
/* Size of a char buffer large enough for the output of %jo, %jd or %p.

   The longest digit string is the octal one: ceil(bits / 3) digits, where
   bits = 8 * sizeof(intmax_t); that is (bits + 2) / 3 in integer arithmetic.
   On top of the digits the buffer needs:
     - 1 character for the sign,
     - 2 characters for the "0x" prefix (for %p),
     - 1 character for the terminating NUL. */
#define MAX_INTMAX_CHARS (4 + (sizeof(intmax_t) * 8 + 2) / 3)
2658
2659
static int
2660
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2661
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2662
16.7k
{
2663
16.7k
    Py_ssize_t length, fill, arglen;
2664
16.7k
    Py_UCS4 maxchar;
2665
2666
16.7k
    length = PyUnicode_GET_LENGTH(str);
2667
16.7k
    if ((precision == -1 || precision >= length)
2668
16.7k
        && width <= length)
2669
16.6k
        return _PyUnicodeWriter_WriteStr(writer, str);
2670
2671
58
    if (precision != -1)
2672
58
        length = Py_MIN(precision, length);
2673
2674
58
    arglen = Py_MAX(length, width);
2675
58
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2676
34
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2677
24
    else
2678
24
        maxchar = writer->maxchar;
2679
2680
58
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2681
0
        return -1;
2682
2683
58
    fill = Py_MAX(width - length, 0);
2684
58
    if (fill && !(flags & F_LJUST)) {
2685
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2686
0
            return -1;
2687
0
        writer->pos += fill;
2688
0
    }
2689
2690
58
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2691
58
                                  str, 0, length);
2692
58
    writer->pos += length;
2693
2694
58
    if (fill && (flags & F_LJUST)) {
2695
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2696
0
            return -1;
2697
0
        writer->pos += fill;
2698
0
    }
2699
2700
58
    return 0;
2701
58
}
2702
2703
static int
2704
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2705
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2706
4.80M
{
2707
    /* UTF-8 */
2708
4.80M
    Py_ssize_t *pconsumed = NULL;
2709
4.80M
    Py_ssize_t length;
2710
4.80M
    if (precision == -1) {
2711
230k
        length = strlen(str);
2712
230k
    }
2713
4.57M
    else {
2714
4.57M
        length = 0;
2715
19.0M
        while (length < precision && str[length]) {
2716
14.4M
            length++;
2717
14.4M
        }
2718
4.57M
        if (length == precision) {
2719
            /* The input string is not NUL-terminated.  If it ends with an
2720
             * incomplete UTF-8 sequence, truncate the string just before it.
2721
             * Incomplete sequences in the middle and sequences which cannot
2722
             * be valid prefixes are still treated as errors and replaced
2723
             * with \xfffd. */
2724
3.18k
            pconsumed = &length;
2725
3.18k
        }
2726
4.57M
    }
2727
2728
4.80M
    if (width < 0) {
2729
4.80M
        return unicode_decode_utf8_writer(writer, str, length,
2730
4.80M
                                          _Py_ERROR_REPLACE, "replace", pconsumed);
2731
4.80M
    }
2732
2733
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2734
0
                                                     "replace", pconsumed);
2735
0
    if (unicode == NULL)
2736
0
        return -1;
2737
2738
0
    int res = unicode_fromformat_write_str(writer, unicode,
2739
0
                                           width, -1, flags);
2740
0
    Py_DECREF(unicode);
2741
0
    return res;
2742
0
}
2743
2744
static int
2745
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2746
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2747
0
{
2748
0
    Py_ssize_t length;
2749
0
    if (precision == -1) {
2750
0
        length = wcslen(str);
2751
0
    }
2752
0
    else {
2753
0
        length = 0;
2754
0
        while (length < precision && str[length]) {
2755
0
            length++;
2756
0
        }
2757
0
    }
2758
2759
0
    if (width < 0) {
2760
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2761
0
                                             str, length);
2762
0
    }
2763
2764
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2765
0
    if (unicode == NULL)
2766
0
        return -1;
2767
2768
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2769
0
    Py_DECREF(unicode);
2770
0
    return res;
2771
0
}
2772
2773
0
/* Size modifiers recognized by unicode_fromformat_arg(); the value 0 stands
   for "no size modifier". */
#define F_LONG      1   /* "l"  */
#define F_LONGLONG  2   /* "ll" */
#define F_SIZE      3   /* "z"  */
#define F_PTRDIFF   4   /* "t"  */
#define F_INTMAX    5   /* "j"  */
2778
2779
static const char*
2780
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2781
                       const char *f, va_list *vargs)
2782
29.3M
{
2783
29.3M
    const char *p;
2784
29.3M
    Py_ssize_t len;
2785
29.3M
    int flags = 0;
2786
29.3M
    Py_ssize_t width;
2787
29.3M
    Py_ssize_t precision;
2788
2789
29.3M
    p = f;
2790
29.3M
    f++;
2791
29.3M
    if (*f == '%') {
2792
4.56M
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2793
0
            return NULL;
2794
4.56M
        f++;
2795
4.56M
        return f;
2796
4.56M
    }
2797
2798
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2799
    /* Flags '+', ' ' and '#' are not particularly useful.
2800
     * They are not worth the implementation and maintenance costs.
2801
     * In addition, '#' should add "0" for "o" conversions for compatibility
2802
     * with printf, but it would confuse Python users. */
2803
24.7M
    while (1) {
2804
24.7M
        switch (*f++) {
2805
0
        case '-': flags |= F_LJUST; continue;
2806
2.55k
        case '0': flags |= F_ZERO; continue;
2807
0
        case '#': flags |= F_ALT; continue;
2808
24.7M
        }
2809
24.7M
        f--;
2810
24.7M
        break;
2811
24.7M
    }
2812
2813
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2814
24.7M
    width = -1;
2815
24.7M
    if (*f == '*') {
2816
0
        width = va_arg(*vargs, int);
2817
0
        if (width < 0) {
2818
0
            flags |= F_LJUST;
2819
0
            width = -width;
2820
0
        }
2821
0
        f++;
2822
0
    }
2823
24.7M
    else if (Py_ISDIGIT((unsigned)*f)) {
2824
2.55k
        width = *f - '0';
2825
2.55k
        f++;
2826
2.55k
        while (Py_ISDIGIT((unsigned)*f)) {
2827
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2828
0
                PyErr_SetString(PyExc_ValueError,
2829
0
                                "width too big");
2830
0
                return NULL;
2831
0
            }
2832
0
            width = (width * 10) + (*f - '0');
2833
0
            f++;
2834
0
        }
2835
2.55k
    }
2836
24.7M
    precision = -1;
2837
24.7M
    if (*f == '.') {
2838
4.57M
        f++;
2839
4.57M
        if (*f == '*') {
2840
0
            precision = va_arg(*vargs, int);
2841
0
            if (precision < 0) {
2842
0
                precision = -2;
2843
0
            }
2844
0
            f++;
2845
0
        }
2846
4.57M
        else if (Py_ISDIGIT((unsigned)*f)) {
2847
4.57M
            precision = (*f - '0');
2848
4.57M
            f++;
2849
13.7M
            while (Py_ISDIGIT((unsigned)*f)) {
2850
9.14M
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2851
0
                    PyErr_SetString(PyExc_ValueError,
2852
0
                                    "precision too big");
2853
0
                    return NULL;
2854
0
                }
2855
9.14M
                precision = (precision * 10) + (*f - '0');
2856
9.14M
                f++;
2857
9.14M
            }
2858
4.57M
        }
2859
4.57M
    }
2860
2861
24.7M
    int sizemod = 0;
2862
24.7M
    if (*f == 'l') {
2863
0
        if (f[1] == 'l') {
2864
0
            sizemod = F_LONGLONG;
2865
0
            f += 2;
2866
0
        }
2867
0
        else {
2868
0
            sizemod = F_LONG;
2869
0
            ++f;
2870
0
        }
2871
0
    }
2872
24.7M
    else if (*f == 'z') {
2873
43.9k
        sizemod = F_SIZE;
2874
43.9k
        ++f;
2875
43.9k
    }
2876
24.7M
    else if (*f == 't') {
2877
0
        sizemod = F_PTRDIFF;
2878
0
        ++f;
2879
0
    }
2880
24.7M
    else if (*f == 'j') {
2881
0
        sizemod = F_INTMAX;
2882
0
        ++f;
2883
0
    }
2884
24.7M
    if (f[0] != '\0' && f[1] == '\0')
2885
4.65M
        writer->overallocate = 0;
2886
2887
24.7M
    switch (*f) {
2888
15.3M
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2889
15.3M
        break;
2890
4.57M
    case 'c': case 'p':
2891
4.57M
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2892
4.57M
        break;
2893
4.80M
    case 's':
2894
4.80M
    case 'V':
2895
4.80M
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2896
4.80M
        break;
2897
4.80M
    default:
2898
16.7k
        if (sizemod) goto invalid_format;
2899
16.7k
        break;
2900
24.7M
    }
2901
2902
24.7M
    switch (*f) {
2903
4.57M
    case 'c':
2904
4.57M
    {
2905
4.57M
        int ordinal = va_arg(*vargs, int);
2906
4.57M
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2907
0
            PyErr_SetString(PyExc_OverflowError,
2908
0
                            "character argument not in range(0x110000)");
2909
0
            return NULL;
2910
0
        }
2911
4.57M
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2912
0
            return NULL;
2913
4.57M
        break;
2914
4.57M
    }
2915
2916
15.3M
    case 'd': case 'i':
2917
15.3M
    case 'o': case 'u': case 'x': case 'X':
2918
15.3M
    {
2919
15.3M
        char buffer[MAX_INTMAX_CHARS];
2920
2921
        // Fill buffer using sprinf, with one of many possible format
2922
        // strings, like "%llX" for `long long` in hexadecimal.
2923
        // The type/size is in `sizemod`; the format is in `*f`.
2924
2925
        // Use macros with nested switches to keep the sprintf format strings
2926
        // as compile-time literals, avoiding warnings and maybe allowing
2927
        // optimizations.
2928
2929
        // `SPRINT` macro does one sprintf
2930
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2931
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2932
15.3M
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2933
15.3M
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2934
2935
        // One inner switch to handle all format variants
2936
15.3M
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2937
15.3M
            switch (*f) {                                                     \
2938
0
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2939
0
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2940
1.50k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2941
1.04k
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2942
15.3M
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2943
15.3M
            }
2944
2945
        // Outer switch to handle all the sizes/types
2946
15.3M
        switch (sizemod) {
2947
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2948
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2949
43.9k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2950
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2951
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2952
15.3M
            default:         DO_SPRINTS("", int, unsigned int); break;
2953
15.3M
        }
2954
15.3M
        #undef SPRINT
2955
15.3M
        #undef DO_SPRINTS
2956
2957
15.3M
        assert(len >= 0);
2958
2959
15.3M
        int sign = (buffer[0] == '-');
2960
15.3M
        len -= sign;
2961
2962
15.3M
        precision = Py_MAX(precision, len);
2963
15.3M
        width = Py_MAX(width, precision + sign);
2964
15.3M
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2965
2.55k
            precision = width - sign;
2966
2.55k
        }
2967
2968
15.3M
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2969
15.3M
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2970
2971
15.3M
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2972
0
            return NULL;
2973
2974
15.3M
        if (spacepad && !(flags & F_LJUST)) {
2975
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2976
0
                return NULL;
2977
0
            writer->pos += spacepad;
2978
0
        }
2979
2980
15.3M
        if (sign) {
2981
0
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2982
0
                return NULL;
2983
0
        }
2984
2985
15.3M
        if (zeropad) {
2986
672
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2987
0
                return NULL;
2988
672
            writer->pos += zeropad;
2989
672
        }
2990
2991
15.3M
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2992
0
            return NULL;
2993
2994
15.3M
        if (spacepad && (flags & F_LJUST)) {
2995
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2996
0
                return NULL;
2997
0
            writer->pos += spacepad;
2998
0
        }
2999
15.3M
        break;
3000
15.3M
    }
3001
3002
15.3M
    case 'p':
3003
0
    {
3004
0
        char number[MAX_INTMAX_CHARS];
3005
3006
0
        len = sprintf(number, "%p", va_arg(*vargs, void*));
3007
0
        assert(len >= 0);
3008
3009
        /* %p is ill-defined:  ensure leading 0x. */
3010
0
        if (number[1] == 'X')
3011
0
            number[1] = 'x';
3012
0
        else if (number[1] != 'x') {
3013
0
            memmove(number + 2, number,
3014
0
                    strlen(number) + 1);
3015
0
            number[0] = '0';
3016
0
            number[1] = 'x';
3017
0
            len += 2;
3018
0
        }
3019
3020
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
3021
0
            return NULL;
3022
0
        break;
3023
0
    }
3024
3025
4.80M
    case 's':
3026
4.80M
    {
3027
4.80M
        if (sizemod) {
3028
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
3029
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
3030
0
                return NULL;
3031
0
        }
3032
4.80M
        else {
3033
            /* UTF-8 */
3034
4.80M
            const char *s = va_arg(*vargs, const char*);
3035
4.80M
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
3036
0
                return NULL;
3037
4.80M
        }
3038
4.80M
        break;
3039
4.80M
    }
3040
3041
4.80M
    case 'U':
3042
16.0k
    {
3043
16.0k
        PyObject *obj = va_arg(*vargs, PyObject *);
3044
16.0k
        assert(obj && _PyUnicode_CHECK(obj));
3045
3046
16.0k
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
3047
0
            return NULL;
3048
16.0k
        break;
3049
16.0k
    }
3050
3051
16.0k
    case 'V':
3052
0
    {
3053
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3054
0
        const char *str;
3055
0
        const wchar_t *wstr;
3056
0
        if (sizemod) {
3057
0
            wstr = va_arg(*vargs, const wchar_t*);
3058
0
        }
3059
0
        else {
3060
0
            str = va_arg(*vargs, const char *);
3061
0
        }
3062
0
        if (obj) {
3063
0
            assert(_PyUnicode_CHECK(obj));
3064
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
3065
0
                return NULL;
3066
0
        }
3067
0
        else if (sizemod) {
3068
0
            assert(wstr != NULL);
3069
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
3070
0
                return NULL;
3071
0
        }
3072
0
        else {
3073
0
            assert(str != NULL);
3074
0
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
3075
0
                return NULL;
3076
0
        }
3077
0
        break;
3078
0
    }
3079
3080
23
    case 'S':
3081
23
    {
3082
23
        PyObject *obj = va_arg(*vargs, PyObject *);
3083
23
        PyObject *str;
3084
23
        assert(obj);
3085
23
        str = PyObject_Str(obj);
3086
23
        if (!str)
3087
0
            return NULL;
3088
23
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
3089
0
            Py_DECREF(str);
3090
0
            return NULL;
3091
0
        }
3092
23
        Py_DECREF(str);
3093
23
        break;
3094
23
    }
3095
3096
612
    case 'R':
3097
612
    {
3098
612
        PyObject *obj = va_arg(*vargs, PyObject *);
3099
612
        PyObject *repr;
3100
612
        assert(obj);
3101
612
        repr = PyObject_Repr(obj);
3102
612
        if (!repr)
3103
0
            return NULL;
3104
612
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
3105
0
            Py_DECREF(repr);
3106
0
            return NULL;
3107
0
        }
3108
612
        Py_DECREF(repr);
3109
612
        break;
3110
612
    }
3111
3112
0
    case 'A':
3113
0
    {
3114
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3115
0
        PyObject *ascii;
3116
0
        assert(obj);
3117
0
        ascii = PyObject_ASCII(obj);
3118
0
        if (!ascii)
3119
0
            return NULL;
3120
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
3121
0
            Py_DECREF(ascii);
3122
0
            return NULL;
3123
0
        }
3124
0
        Py_DECREF(ascii);
3125
0
        break;
3126
0
    }
3127
3128
0
    case 'T':
3129
0
    {
3130
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3131
0
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
3132
3133
0
        PyObject *type_name;
3134
0
        if (flags & F_ALT) {
3135
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3136
0
        }
3137
0
        else {
3138
0
            type_name = PyType_GetFullyQualifiedName(type);
3139
0
        }
3140
0
        Py_DECREF(type);
3141
0
        if (!type_name) {
3142
0
            return NULL;
3143
0
        }
3144
3145
0
        if (unicode_fromformat_write_str(writer, type_name,
3146
0
                                         width, precision, flags) == -1) {
3147
0
            Py_DECREF(type_name);
3148
0
            return NULL;
3149
0
        }
3150
0
        Py_DECREF(type_name);
3151
0
        break;
3152
0
    }
3153
3154
0
    case 'N':
3155
0
    {
3156
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3157
0
        assert(type_raw != NULL);
3158
3159
0
        if (!PyType_Check(type_raw)) {
3160
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3161
0
            return NULL;
3162
0
        }
3163
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3164
3165
0
        PyObject *type_name;
3166
0
        if (flags & F_ALT) {
3167
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3168
0
        }
3169
0
        else {
3170
0
            type_name = PyType_GetFullyQualifiedName(type);
3171
0
        }
3172
0
        if (!type_name) {
3173
0
            return NULL;
3174
0
        }
3175
0
        if (unicode_fromformat_write_str(writer, type_name,
3176
0
                                         width, precision, flags) == -1) {
3177
0
            Py_DECREF(type_name);
3178
0
            return NULL;
3179
0
        }
3180
0
        Py_DECREF(type_name);
3181
0
        break;
3182
0
    }
3183
3184
0
    default:
3185
0
    invalid_format:
3186
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3187
0
        return NULL;
3188
24.7M
    }
3189
3190
24.7M
    f++;
3191
24.7M
    return f;
3192
24.7M
}
3193
3194
static int
3195
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3196
12.4M
{
3197
12.4M
    Py_ssize_t len = strlen(format);
3198
12.4M
    writer->min_length += len + 100;
3199
12.4M
    writer->overallocate = 1;
3200
3201
    // Copy varags to be able to pass a reference to a subfunction.
3202
12.4M
    va_list vargs2;
3203
12.4M
    va_copy(vargs2, vargs);
3204
3205
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3206
    // to be encoded to ASCII.
3207
12.4M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3208
12.4M
    if (!is_ascii) {
3209
0
        Py_ssize_t i;
3210
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3211
0
        PyErr_Format(PyExc_ValueError,
3212
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3213
0
            "string, got a non-ASCII byte: 0x%02x",
3214
0
            (unsigned char)format[i]);
3215
0
        goto fail;
3216
0
    }
3217
3218
69.6M
    for (const char *f = format; *f; ) {
3219
57.2M
        if (*f == '%') {
3220
29.3M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3221
29.3M
            if (f == NULL)
3222
0
                goto fail;
3223
29.3M
        }
3224
27.8M
        else {
3225
27.8M
            const char *p = strchr(f, '%');
3226
27.8M
            if (p != NULL) {
3227
20.1M
                len = p - f;
3228
20.1M
            }
3229
7.77M
            else {
3230
7.77M
                len = strlen(f);
3231
7.77M
                writer->overallocate = 0;
3232
7.77M
            }
3233
3234
27.8M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3235
0
                goto fail;
3236
0
            }
3237
27.8M
            f += len;
3238
27.8M
        }
3239
57.2M
    }
3240
12.4M
    va_end(vargs2);
3241
12.4M
    return 0;
3242
3243
0
  fail:
3244
0
    va_end(vargs2);
3245
0
    return -1;
3246
12.4M
}
3247
3248
PyObject *
3249
PyUnicode_FromFormatV(const char *format, va_list vargs)
3250
12.4M
{
3251
12.4M
    _PyUnicodeWriter writer;
3252
12.4M
    _PyUnicodeWriter_Init(&writer);
3253
3254
12.4M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3255
0
        _PyUnicodeWriter_Dealloc(&writer);
3256
0
        return NULL;
3257
0
    }
3258
12.4M
    return _PyUnicodeWriter_Finish(&writer);
3259
12.4M
}
3260
3261
/* Variadic front end of PyUnicode_FromFormatV(): collect the arguments that
   follow format into a va_list and forward them.

   Return whatever PyUnicode_FromFormatV() returns. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list vargs;

    va_start(vargs, format);
    PyObject *result = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);

    return result;
}
3272
3273
int
3274
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3275
0
{
3276
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3277
0
    Py_ssize_t old_pos = _writer->pos;
3278
3279
0
    va_list vargs;
3280
0
    va_start(vargs, format);
3281
0
    int res = unicode_from_format(_writer, format, vargs);
3282
0
    va_end(vargs);
3283
3284
0
    if (res < 0) {
3285
0
        _writer->pos = old_pos;
3286
0
    }
3287
0
    return res;
3288
0
}
3289
3290
static Py_ssize_t
3291
unicode_get_widechar_size(PyObject *unicode)
3292
1.73k
{
3293
1.73k
    Py_ssize_t res;
3294
3295
1.73k
    assert(unicode != NULL);
3296
1.73k
    assert(_PyUnicode_CHECK(unicode));
3297
3298
1.73k
    res = _PyUnicode_LENGTH(unicode);
3299
#if SIZEOF_WCHAR_T == 2
3300
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3301
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3302
        const Py_UCS4 *end = s + res;
3303
        for (; s < end; ++s) {
3304
            if (*s > 0xFFFF) {
3305
                ++res;
3306
            }
3307
        }
3308
    }
3309
#endif
3310
1.73k
    return res;
3311
1.73k
}
3312
3313
static void
3314
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3315
1.73k
{
3316
1.73k
    assert(unicode != NULL);
3317
1.73k
    assert(_PyUnicode_CHECK(unicode));
3318
3319
1.73k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3320
0
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3321
0
        return;
3322
0
    }
3323
3324
1.73k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3325
1.73k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3326
111k
        for (; size--; ++s, ++w) {
3327
109k
            *w = *s;
3328
109k
        }
3329
1.73k
    }
3330
0
    else {
3331
0
#if SIZEOF_WCHAR_T == 4
3332
0
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3333
0
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3334
0
        for (; size--; ++s, ++w) {
3335
0
            *w = *s;
3336
0
        }
3337
#else
3338
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3339
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3340
        for (; size--; ++s, ++w) {
3341
            Py_UCS4 ch = *s;
3342
            if (ch > 0xFFFF) {
3343
                assert(ch <= MAX_UNICODE);
3344
                /* encode surrogate pair in this case */
3345
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3346
                if (!size--)
3347
                    break;
3348
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3349
            }
3350
            else {
3351
                *w = ch;
3352
            }
3353
        }
3354
#endif
3355
0
    }
3356
1.73k
}
3357
3358
#ifdef HAVE_WCHAR_H
3359
3360
/* Convert a Unicode object to a wide character string.
3361
3362
   - If w is NULL: return the number of wide characters (including the null
3363
     character) required to convert the unicode object. Ignore size argument.
3364
3365
   - Otherwise: return the number of wide characters (excluding the null
3366
     character) written into w. Write at most size wide characters (including
3367
     the null character). */
3368
Py_ssize_t
3369
PyUnicode_AsWideChar(PyObject *unicode,
3370
                     wchar_t *w,
3371
                     Py_ssize_t size)
3372
469
{
3373
469
    Py_ssize_t res;
3374
3375
469
    if (unicode == NULL) {
3376
0
        PyErr_BadInternalCall();
3377
0
        return -1;
3378
0
    }
3379
469
    if (!PyUnicode_Check(unicode)) {
3380
0
        PyErr_BadArgument();
3381
0
        return -1;
3382
0
    }
3383
3384
469
    res = unicode_get_widechar_size(unicode);
3385
469
    if (w == NULL) {
3386
0
        return res + 1;
3387
0
    }
3388
3389
469
    if (size > res) {
3390
469
        size = res + 1;
3391
469
    }
3392
0
    else {
3393
0
        res = size;
3394
0
    }
3395
469
    unicode_copy_as_widechar(unicode, w, size);
3396
3397
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3398
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3399
       non-Unicode locales and hence needs conversion first. */
3400
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3401
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3402
            return -1;
3403
        }
3404
    }
3405
#endif
3406
3407
469
    return res;
3408
469
}
3409
3410
wchar_t*
3411
PyUnicode_AsWideCharString(PyObject *unicode,
3412
                           Py_ssize_t *size)
3413
1.26k
{
3414
1.26k
    wchar_t *buffer;
3415
1.26k
    Py_ssize_t buflen;
3416
3417
1.26k
    if (unicode == NULL) {
3418
0
        PyErr_BadInternalCall();
3419
0
        return NULL;
3420
0
    }
3421
1.26k
    if (!PyUnicode_Check(unicode)) {
3422
0
        PyErr_BadArgument();
3423
0
        return NULL;
3424
0
    }
3425
3426
1.26k
    buflen = unicode_get_widechar_size(unicode);
3427
1.26k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3428
1.26k
    if (buffer == NULL) {
3429
0
        PyErr_NoMemory();
3430
0
        return NULL;
3431
0
    }
3432
1.26k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3433
3434
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3435
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3436
       non-Unicode locales and hence needs conversion first. */
3437
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3438
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3439
            return NULL;
3440
        }
3441
    }
3442
#endif
3443
3444
1.26k
    if (size != NULL) {
3445
820
        *size = buflen;
3446
820
    }
3447
448
    else if (wcslen(buffer) != (size_t)buflen) {
3448
0
        PyMem_Free(buffer);
3449
0
        PyErr_SetString(PyExc_ValueError,
3450
0
                        "embedded null character");
3451
0
        return NULL;
3452
0
    }
3453
1.26k
    return buffer;
3454
1.26k
}
3455
3456
#endif /* HAVE_WCHAR_H */
3457
3458
int
3459
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3460
0
{
3461
0
    wchar_t **p = (wchar_t **)ptr;
3462
0
    if (obj == NULL) {
3463
0
        PyMem_Free(*p);
3464
0
        *p = NULL;
3465
0
        return 1;
3466
0
    }
3467
0
    if (PyUnicode_Check(obj)) {
3468
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3469
0
        if (*p == NULL) {
3470
0
            return 0;
3471
0
        }
3472
0
        return Py_CLEANUP_SUPPORTED;
3473
0
    }
3474
0
    PyErr_Format(PyExc_TypeError,
3475
0
                 "argument must be str, not %.50s",
3476
0
                 Py_TYPE(obj)->tp_name);
3477
0
    return 0;
3478
0
}
3479
3480
int
3481
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3482
0
{
3483
0
    wchar_t **p = (wchar_t **)ptr;
3484
0
    if (obj == NULL) {
3485
0
        PyMem_Free(*p);
3486
0
        *p = NULL;
3487
0
        return 1;
3488
0
    }
3489
0
    if (obj == Py_None) {
3490
0
        *p = NULL;
3491
0
        return 1;
3492
0
    }
3493
0
    if (PyUnicode_Check(obj)) {
3494
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3495
0
        if (*p == NULL) {
3496
0
            return 0;
3497
0
        }
3498
0
        return Py_CLEANUP_SUPPORTED;
3499
0
    }
3500
0
    PyErr_Format(PyExc_TypeError,
3501
0
                 "argument must be str or None, not %.50s",
3502
0
                 Py_TYPE(obj)->tp_name);
3503
0
    return 0;
3504
0
}
3505
3506
PyObject *
3507
PyUnicode_FromOrdinal(int ordinal)
3508
184k
{
3509
184k
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3510
0
        PyErr_SetString(PyExc_ValueError,
3511
0
                        "chr() arg not in range(0x110000)");
3512
0
        return NULL;
3513
0
    }
3514
3515
184k
    return unicode_char((Py_UCS4)ordinal);
3516
184k
}
3517
3518
/* Return a str for obj without any decoding.

   An exact str is returned as a new reference to the same object; an
   instance of a str subtype is copied with _PyUnicode_Copy(). Any other
   object raises TypeError and NULL is returned. */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    /* XXX Perhaps we should make this API an alias of
       PyObject_Str() instead ?! */
    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    /* For a Unicode subtype that's not a Unicode object,
       return a true Unicode object with the same data. */
    return _PyUnicode_Copy(obj);
}
3536
3537
/* Decode a bytes or bytes-like object into a str.

   bytes objects are decoded directly from their internal buffer; other
   objects are read through the buffer protocol (PyBUF_SIMPLE). Empty
   input yields the empty string after encoding/errors have been checked.
   str objects are rejected with TypeError.

   Return NULL on error. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3589
3590
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters between two copied characters becomes a single '_'.
   Leading and trailing runs are dropped.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last writable slot: reserved for the terminating NUL. */
    char *out_end = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        /* Emit the separator only between two copied characters. */
        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3638
3639
PyObject *
3640
PyUnicode_Decode(const char *s,
3641
                 Py_ssize_t size,
3642
                 const char *encoding,
3643
                 const char *errors)
3644
3.61M
{
3645
3.61M
    PyObject *buffer = NULL, *unicode;
3646
3.61M
    Py_buffer info;
3647
3.61M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3648
3649
3.61M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3650
0
        return NULL;
3651
0
    }
3652
3653
3.61M
    if (size == 0) {
3654
0
        _Py_RETURN_UNICODE_EMPTY();
3655
0
    }
3656
3657
3.61M
    if (encoding == NULL) {
3658
31.0k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3659
31.0k
    }
3660
3661
    /* Shortcuts for common default encodings */
3662
3.58M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3663
3.57M
        char *lower = buflower;
3664
3665
        /* Fast paths */
3666
3.57M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3667
741k
            lower += 3;
3668
741k
            if (*lower == '_') {
3669
                /* Match "utf8" and "utf_8" */
3670
740k
                lower++;
3671
740k
            }
3672
3673
741k
            if (lower[0] == '8' && lower[1] == 0) {
3674
740k
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3675
740k
            }
3676
729
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3677
103
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3678
103
            }
3679
626
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3680
47
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3681
47
            }
3682
741k
        }
3683
2.83M
        else {
3684
2.83M
            if (strcmp(lower, "ascii") == 0
3685
2.83M
                || strcmp(lower, "us_ascii") == 0) {
3686
732k
                return PyUnicode_DecodeASCII(s, size, errors);
3687
732k
            }
3688
    #ifdef MS_WINDOWS
3689
            else if (strcmp(lower, "mbcs") == 0) {
3690
                return PyUnicode_DecodeMBCS(s, size, errors);
3691
            }
3692
    #endif
3693
2.09M
            else if (strcmp(lower, "latin1") == 0
3694
2.09M
                     || strcmp(lower, "latin_1") == 0
3695
2.09M
                     || strcmp(lower, "iso_8859_1") == 0
3696
2.09M
                     || strcmp(lower, "iso8859_1") == 0) {
3697
1.81M
                return PyUnicode_DecodeLatin1(s, size, errors);
3698
1.81M
            }
3699
2.83M
        }
3700
3.57M
    }
3701
3702
    /* Decode via the codec registry */
3703
297k
    buffer = NULL;
3704
297k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3705
0
        goto onError;
3706
297k
    buffer = PyMemoryView_FromBuffer(&info);
3707
297k
    if (buffer == NULL)
3708
0
        goto onError;
3709
297k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3710
297k
    if (unicode == NULL)
3711
143k
        goto onError;
3712
153k
    if (!PyUnicode_Check(unicode)) {
3713
0
        PyErr_Format(PyExc_TypeError,
3714
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3715
0
                     "use codecs.decode() to decode to arbitrary types",
3716
0
                     encoding,
3717
0
                     Py_TYPE(unicode)->tp_name);
3718
0
        Py_DECREF(unicode);
3719
0
        goto onError;
3720
0
    }
3721
153k
    Py_DECREF(buffer);
3722
153k
    return unicode_result(unicode);
3723
3724
143k
  onError:
3725
143k
    Py_XDECREF(buffer);
3726
143k
    return NULL;
3727
153k
}
3728
3729
/* Decode a str object through the codec registry and return whatever
   object the codec produces.

   A NULL encoding selects PyUnicode_GetDefaultEncoding(). A non-str
   argument is rejected with PyErr_BadArgument(). Return NULL on error. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec_name, errors);
}
3745
3746
/* Decode a str object through the codec registry, requiring a str result.

   A NULL encoding selects PyUnicode_GetDefaultEncoding(). A non-str
   argument is rejected with PyErr_BadArgument(). If the codec returns
   something other than a str, TypeError is raised naming the type the
   decoder actually returned.

   Return NULL on error. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        return NULL;

    if (!PyUnicode_Check(v)) {
        /* Report the type of the decoder's result (v); the input object is
           known to be a str at this point, so naming its type would make
           the message read "returned 'str' instead of 'str'". */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3779
3780
/* Encode a str object through the codec registry and return whatever
   object the codec produces.

   A NULL encoding selects PyUnicode_GetDefaultEncoding(). A non-str
   argument is rejected with PyErr_BadArgument(). Return NULL on error. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    return PyCodec_Encode(unicode, codec_name, errors);
}
3804
3805
3806
static PyObject *
3807
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3808
                      int current_locale)
3809
420
{
3810
420
    Py_ssize_t wlen;
3811
420
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3812
420
    if (wstr == NULL) {
3813
0
        return NULL;
3814
0
    }
3815
3816
420
    if ((size_t)wlen != wcslen(wstr)) {
3817
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3818
0
        PyMem_Free(wstr);
3819
0
        return NULL;
3820
0
    }
3821
3822
420
    char *str;
3823
420
    size_t error_pos;
3824
420
    const char *reason;
3825
420
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3826
420
                                 current_locale, error_handler);
3827
420
    PyMem_Free(wstr);
3828
3829
420
    if (res != 0) {
3830
0
        if (res == -2) {
3831
0
            PyObject *exc;
3832
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3833
0
                    "locale", unicode,
3834
0
                    (Py_ssize_t)error_pos,
3835
0
                    (Py_ssize_t)(error_pos+1),
3836
0
                    reason);
3837
0
            if (exc != NULL) {
3838
0
                PyCodec_StrictErrors(exc);
3839
0
                Py_DECREF(exc);
3840
0
            }
3841
0
        }
3842
0
        else if (res == -3) {
3843
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3844
0
        }
3845
0
        else {
3846
0
            PyErr_NoMemory();
3847
0
        }
3848
0
        return NULL;
3849
0
    }
3850
3851
420
    PyObject *bytes = PyBytes_FromString(str);
3852
420
    PyMem_RawFree(str);
3853
420
    return bytes;
3854
420
}
3855
3856
/* Encode a str object with unicode_encode_locale(), passing
   current_locale=1 and the error handler named by `errors`. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    _Py_error_handler handler = _Py_GetErrorHandler(errors);

    return unicode_encode_locale(unicode, handler, 1);
}
3862
3863
/* Encode a str object using the interpreter's filesystem codec.

   The UTF-8 fast path is used when fs_codec->utf8 is set; otherwise the
   configured encoding is used if available. When neither is set, the
   error handler is taken from config->filesystem_errors and the string
   is encoded without the codec machinery. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }

#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *fs_errors = config->filesystem_errors;
    assert(fs_errors != NULL);
    _Py_error_handler handler = get_error_handler_wide(fs_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3896
3897
/* Encode a str object to bytes using the given encoding.

   A NULL encoding encodes as UTF-8. Common encoding names (UTF-8/16/32,
   ASCII, Latin-1 and, on Windows, mbcs) are dispatched directly to their
   encoders; everything else goes through the codec registry. A bytearray
   returned by a codec is converted to bytes after a RuntimeWarning; any
   other non-bytes result raises TypeError.

   Return NULL on error. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Room for "iso_8859_1" (10 characters) plus the terminating NUL,
       the longest shortcut name. */
    char normalized[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }

    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized))) {
        const char *name = normalized;

        /* Fast paths */
        if (strncmp(name, "utf", 3) == 0) {
            name += 3;
            if (*name == '_') {
                /* Match "utf8" and "utf_8" */
                name++;
            }

            if (strcmp(name, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(name, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(name, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else {
            if (strcmp(name, "ascii") == 0
                || strcmp(name, "us_ascii") == 0) {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            if (strcmp(name, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            if (strcmp(name, "latin1") == 0
                || strcmp(name, "latin_1") == 0
                || strcmp(name, "iso_8859_1") == 0
                || strcmp(name, "iso8859_1") == 0) {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    PyObject *encoded = _PyCodec_EncodeText(unicode, encoding, errors);

    /* The normal path (bytes), or the codec failed */
    if (encoded == NULL || PyBytes_Check(encoded)) {
        return encoded;
    }

    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* If the codec returns a buffer, raise a warning and convert to bytes */
    if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding)) {
        Py_DECREF(encoded);
        return NULL;
    }

    PyObject *bytes = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                                PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return bytes;
}
3996
3997
/* Encode a str object through the codec registry, requiring a str result.

   A NULL encoding selects PyUnicode_GetDefaultEncoding(). A non-str
   argument is rejected with PyErr_BadArgument(); a non-str codec result
   raises TypeError. Return NULL on error. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    PyObject *result = PyCodec_Encode(unicode, encoding, errors);
    if (result == NULL) {
        return NULL;
    }

    if (PyUnicode_Check(result)) {
        return result;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(result)->tp_name);
    Py_DECREF(result);
    return NULL;
}
4030
4031
static PyObject*
4032
unicode_decode_locale(const char *str, Py_ssize_t len,
4033
                      _Py_error_handler errors, int current_locale)
4034
17.2k
{
4035
17.2k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
4036
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4037
0
        return NULL;
4038
0
    }
4039
4040
17.2k
    wchar_t *wstr;
4041
17.2k
    size_t wlen;
4042
17.2k
    const char *reason;
4043
17.2k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
4044
17.2k
                                 current_locale, errors);
4045
17.2k
    if (res != 0) {
4046
0
        if (res == -2) {
4047
0
            PyObject *exc;
4048
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4049
0
                                        "locale", str, len,
4050
0
                                        (Py_ssize_t)wlen,
4051
0
                                        (Py_ssize_t)(wlen + 1),
4052
0
                                        reason);
4053
0
            if (exc != NULL) {
4054
0
                PyCodec_StrictErrors(exc);
4055
0
                Py_DECREF(exc);
4056
0
            }
4057
0
        }
4058
0
        else if (res == -3) {
4059
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4060
0
        }
4061
0
        else {
4062
0
            PyErr_NoMemory();
4063
0
        }
4064
0
        return NULL;
4065
0
    }
4066
4067
17.2k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4068
17.2k
    PyMem_RawFree(wstr);
4069
17.2k
    return unicode;
4070
17.2k
}
4071
4072
PyObject*
4073
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4074
                              const char *errors)
4075
0
{
4076
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4077
0
    return unicode_decode_locale(str, len, error_handler, 1);
4078
0
}
4079
4080
PyObject*
4081
PyUnicode_DecodeLocale(const char *str, const char *errors)
4082
12.0k
{
4083
12.0k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
4084
12.0k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4085
12.0k
    return unicode_decode_locale(str, size, error_handler, 1);
4086
12.0k
}
4087
4088
4089
PyObject*
4090
0
PyUnicode_DecodeFSDefault(const char *s) {
4091
0
    Py_ssize_t size = (Py_ssize_t)strlen(s);
4092
0
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
4093
0
}
4094
4095
PyObject*
4096
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4097
6.73k
{
4098
6.73k
    PyInterpreterState *interp = _PyInterpreterState_GET();
4099
6.73k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4100
6.73k
    if (fs_codec->utf8) {
4101
0
        return unicode_decode_utf8(s, size,
4102
0
                                   fs_codec->error_handler,
4103
0
                                   fs_codec->errors,
4104
0
                                   NULL);
4105
0
    }
4106
6.73k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
4107
6.73k
    else if (fs_codec->encoding) {
4108
1.61k
        return PyUnicode_Decode(s, size,
4109
1.61k
                                fs_codec->encoding,
4110
1.61k
                                fs_codec->errors);
4111
1.61k
    }
4112
5.12k
#endif
4113
5.12k
    else {
4114
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
4115
           machinery is not ready and so cannot be used:
4116
           use mbstowcs() in this case. */
4117
5.12k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4118
5.12k
        const wchar_t *filesystem_errors = config->filesystem_errors;
4119
5.12k
        assert(filesystem_errors != NULL);
4120
5.12k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4121
5.12k
        assert(errors != _Py_ERROR_UNKNOWN);
4122
#ifdef _Py_FORCE_UTF8_FS_ENCODING
4123
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
4124
#else
4125
5.12k
        return unicode_decode_locale(s, size, errors, 0);
4126
5.12k
#endif
4127
5.12k
    }
4128
6.73k
}
4129
4130
4131
int
4132
PyUnicode_FSConverter(PyObject* arg, void* addr)
4133
12.6k
{
4134
12.6k
    PyObject *path = NULL;
4135
12.6k
    PyObject *output = NULL;
4136
12.6k
    Py_ssize_t size;
4137
12.6k
    const char *data;
4138
12.6k
    if (arg == NULL) {
4139
0
        Py_DECREF(*(PyObject**)addr);
4140
0
        *(PyObject**)addr = NULL;
4141
0
        return 1;
4142
0
    }
4143
12.6k
    path = PyOS_FSPath(arg);
4144
12.6k
    if (path == NULL) {
4145
0
        return 0;
4146
0
    }
4147
12.6k
    if (PyBytes_Check(path)) {
4148
0
        output = path;
4149
0
    }
4150
12.6k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4151
12.6k
        output = PyUnicode_EncodeFSDefault(path);
4152
12.6k
        Py_DECREF(path);
4153
12.6k
        if (!output) {
4154
0
            return 0;
4155
0
        }
4156
12.6k
        assert(PyBytes_Check(output));
4157
12.6k
    }
4158
4159
12.6k
    size = PyBytes_GET_SIZE(output);
4160
12.6k
    data = PyBytes_AS_STRING(output);
4161
12.6k
    if ((size_t)size != strlen(data)) {
4162
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4163
0
        Py_DECREF(output);
4164
0
        return 0;
4165
0
    }
4166
12.6k
    *(PyObject**)addr = output;
4167
12.6k
    return Py_CLEANUP_SUPPORTED;
4168
12.6k
}
4169
4170
4171
int
4172
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4173
22.7k
{
4174
22.7k
    if (arg == NULL) {
4175
0
        Py_DECREF(*(PyObject**)addr);
4176
0
        *(PyObject**)addr = NULL;
4177
0
        return 1;
4178
0
    }
4179
4180
22.7k
    PyObject *path = PyOS_FSPath(arg);
4181
22.7k
    if (path == NULL) {
4182
0
        return 0;
4183
0
    }
4184
4185
22.7k
    PyObject *output = NULL;
4186
22.7k
    if (PyUnicode_Check(path)) {
4187
22.7k
        output = path;
4188
22.7k
    }
4189
0
    else if (PyBytes_Check(path)) {
4190
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4191
0
                                                  PyBytes_GET_SIZE(path));
4192
0
        Py_DECREF(path);
4193
0
        if (!output) {
4194
0
            return 0;
4195
0
        }
4196
0
    }
4197
0
    else {
4198
0
        PyErr_Format(PyExc_TypeError,
4199
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4200
0
                     Py_TYPE(arg)->tp_name);
4201
0
        Py_DECREF(path);
4202
0
        return 0;
4203
0
    }
4204
4205
22.7k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4206
22.7k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4207
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4208
0
        Py_DECREF(output);
4209
0
        return 0;
4210
0
    }
4211
22.7k
    *(PyObject**)addr = output;
4212
22.7k
    return Py_CLEANUP_SUPPORTED;
4213
22.7k
}
4214
4215
4216
static int unicode_fill_utf8(PyObject *unicode);
4217
4218
4219
static int
4220
unicode_ensure_utf8(PyObject *unicode)
4221
15.0M
{
4222
15.0M
    int err = 0;
4223
15.0M
    if (PyUnicode_UTF8(unicode) == NULL) {
4224
121k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4225
121k
        if (PyUnicode_UTF8(unicode) == NULL) {
4226
121k
            err = unicode_fill_utf8(unicode);
4227
121k
        }
4228
121k
        Py_END_CRITICAL_SECTION();
4229
121k
    }
4230
15.0M
    return err;
4231
15.0M
}
4232
4233
const char *
4234
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4235
15.0M
{
4236
15.0M
    if (!PyUnicode_Check(unicode)) {
4237
0
        PyErr_BadArgument();
4238
0
        if (psize) {
4239
0
            *psize = -1;
4240
0
        }
4241
0
        return NULL;
4242
0
    }
4243
4244
15.0M
    if (unicode_ensure_utf8(unicode) == -1) {
4245
274
        if (psize) {
4246
274
            *psize = -1;
4247
274
        }
4248
274
        return NULL;
4249
274
    }
4250
4251
15.0M
    if (psize) {
4252
14.9M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4253
14.9M
    }
4254
15.0M
    return PyUnicode_UTF8(unicode);
4255
15.0M
}
4256
4257
const char *
4258
PyUnicode_AsUTF8(PyObject *unicode)
4259
66.9k
{
4260
66.9k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4261
66.9k
}
4262
4263
const char *
4264
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4265
900k
{
4266
900k
    Py_ssize_t size;
4267
900k
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4268
900k
    if (s && strlen(s) != (size_t)size) {
4269
125
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4270
125
        return NULL;
4271
125
    }
4272
900k
    return s;
4273
900k
}
4274
4275
/*
4276
PyUnicode_GetSize() has been deprecated since Python 3.3
4277
because it returned length of Py_UNICODE.
4278
4279
But this function is part of stable abi, because it doesn't
4280
include Py_UNICODE in signature and it was not excluded from
4281
stable ABI in PEP 384.
4282
*/
4283
PyAPI_FUNC(Py_ssize_t)
4284
PyUnicode_GetSize(PyObject *unicode)
4285
0
{
4286
0
    PyErr_SetString(PyExc_RuntimeError,
4287
0
                    "PyUnicode_GetSize has been removed.");
4288
0
    return -1;
4289
0
}
4290
4291
Py_ssize_t
4292
PyUnicode_GetLength(PyObject *unicode)
4293
33.3k
{
4294
33.3k
    if (!PyUnicode_Check(unicode)) {
4295
0
        PyErr_BadArgument();
4296
0
        return -1;
4297
0
    }
4298
33.3k
    return PyUnicode_GET_LENGTH(unicode);
4299
33.3k
}
4300
4301
Py_UCS4
4302
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4303
35
{
4304
35
    const void *data;
4305
35
    int kind;
4306
4307
35
    if (!PyUnicode_Check(unicode)) {
4308
0
        PyErr_BadArgument();
4309
0
        return (Py_UCS4)-1;
4310
0
    }
4311
35
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4312
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4313
0
        return (Py_UCS4)-1;
4314
0
    }
4315
35
    data = PyUnicode_DATA(unicode);
4316
35
    kind = PyUnicode_KIND(unicode);
4317
35
    return PyUnicode_READ(kind, data, index);
4318
35
}
4319
4320
int
4321
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4322
0
{
4323
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4324
0
        PyErr_BadArgument();
4325
0
        return -1;
4326
0
    }
4327
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4328
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4329
0
        return -1;
4330
0
    }
4331
0
    if (unicode_check_modifiable(unicode))
4332
0
        return -1;
4333
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4334
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4335
0
        return -1;
4336
0
    }
4337
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4338
0
                    index, ch);
4339
0
    return 0;
4340
0
}
4341
4342
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *name = "utf-8";

    return name;
}
4347
4348
/* create or adjust a UnicodeDecodeError */
4349
static void
4350
make_decode_exception(PyObject **exceptionObject,
4351
                      const char *encoding,
4352
                      const char *input, Py_ssize_t length,
4353
                      Py_ssize_t startpos, Py_ssize_t endpos,
4354
                      const char *reason)
4355
243k
{
4356
243k
    if (*exceptionObject == NULL) {
4357
79.2k
        *exceptionObject = PyUnicodeDecodeError_Create(
4358
79.2k
            encoding, input, length, startpos, endpos, reason);
4359
79.2k
    }
4360
163k
    else {
4361
163k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4362
0
            goto onError;
4363
163k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4364
0
            goto onError;
4365
163k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4366
0
            goto onError;
4367
163k
    }
4368
243k
    return;
4369
4370
243k
onError:
4371
0
    Py_CLEAR(*exceptionObject);
4372
0
}
4373
4374
#ifdef MS_WINDOWS
4375
static int
4376
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4377
{
4378
    if (newsize > *size) {
4379
        wchar_t *newbuf = *buf;
4380
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4381
            PyErr_NoMemory();
4382
            return -1;
4383
        }
4384
        *buf = newbuf;
4385
    }
4386
    *size = newsize;
4387
    return 0;
4388
}
4389
4390
/* error handling callback helper:
4391
   build arguments, call the callback and check the arguments,
4392
   if no exception occurred, copy the replacement to the output
4393
   and adjust various state variables.
4394
   return 0 on success, -1 on error
4395
*/
4396
4397
static int
4398
unicode_decode_call_errorhandler_wchar(
4399
    const char *errors, PyObject **errorHandler,
4400
    const char *encoding, const char *reason,
4401
    const char **input, const char **inend, Py_ssize_t *startinpos,
4402
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4403
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4404
{
4405
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4406
4407
    PyObject *restuple = NULL;
4408
    PyObject *repunicode = NULL;
4409
    Py_ssize_t outsize;
4410
    Py_ssize_t insize;
4411
    Py_ssize_t requiredsize;
4412
    Py_ssize_t newpos;
4413
    PyObject *inputobj = NULL;
4414
    Py_ssize_t repwlen;
4415
4416
    if (*errorHandler == NULL) {
4417
        *errorHandler = PyCodec_LookupError(errors);
4418
        if (*errorHandler == NULL)
4419
            goto onError;
4420
    }
4421
4422
    make_decode_exception(exceptionObject,
4423
        encoding,
4424
        *input, *inend - *input,
4425
        *startinpos, *endinpos,
4426
        reason);
4427
    if (*exceptionObject == NULL)
4428
        goto onError;
4429
4430
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4431
    if (restuple == NULL)
4432
        goto onError;
4433
    if (!PyTuple_Check(restuple)) {
4434
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4435
        goto onError;
4436
    }
4437
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4438
        goto onError;
4439
4440
    /* Copy back the bytes variables, which might have been modified by the
4441
       callback */
4442
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4443
    if (!inputobj)
4444
        goto onError;
4445
    *input = PyBytes_AS_STRING(inputobj);
4446
    insize = PyBytes_GET_SIZE(inputobj);
4447
    *inend = *input + insize;
4448
    /* we can DECREF safely, as the exception has another reference,
4449
       so the object won't go away. */
4450
    Py_DECREF(inputobj);
4451
4452
    if (newpos<0)
4453
        newpos = insize+newpos;
4454
    if (newpos<0 || newpos>insize) {
4455
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4456
        goto onError;
4457
    }
4458
4459
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4460
    if (repwlen < 0)
4461
        goto onError;
4462
    repwlen--;
4463
    /* need more space? (at least enough for what we
4464
       have+the replacement+the rest of the string (starting
4465
       at the new input position), so we won't have to check space
4466
       when there are no errors in the rest of the string) */
4467
    requiredsize = *outpos;
4468
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4469
        goto overflow;
4470
    requiredsize += repwlen;
4471
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4472
        goto overflow;
4473
    requiredsize += insize - newpos;
4474
    outsize = *bufsize;
4475
    if (requiredsize > outsize) {
4476
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4477
            requiredsize = 2*outsize;
4478
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4479
            goto onError;
4480
        }
4481
    }
4482
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4483
    *outpos += repwlen;
4484
    *endinpos = newpos;
4485
    *inptr = *input + newpos;
4486
4487
    /* we made it! */
4488
    Py_DECREF(restuple);
4489
    return 0;
4490
4491
  overflow:
4492
    PyErr_SetString(PyExc_OverflowError,
4493
                    "decoded result is too long for a Python string");
4494
4495
  onError:
4496
    Py_XDECREF(restuple);
4497
    return -1;
4498
}
4499
#endif   /* MS_WINDOWS */
4500
4501
static int
4502
unicode_decode_call_errorhandler_writer(
4503
    const char *errors, PyObject **errorHandler,
4504
    const char *encoding, const char *reason,
4505
    const char **input, const char **inend, Py_ssize_t *startinpos,
4506
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4507
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4508
243k
{
4509
243k
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4510
4511
243k
    PyObject *restuple = NULL;
4512
243k
    PyObject *repunicode = NULL;
4513
243k
    Py_ssize_t insize;
4514
243k
    Py_ssize_t newpos;
4515
243k
    Py_ssize_t replen;
4516
243k
    Py_ssize_t remain;
4517
243k
    PyObject *inputobj = NULL;
4518
243k
    int need_to_grow = 0;
4519
243k
    const char *new_inptr;
4520
4521
243k
    if (*errorHandler == NULL) {
4522
79.2k
        *errorHandler = PyCodec_LookupError(errors);
4523
79.2k
        if (*errorHandler == NULL)
4524
0
            goto onError;
4525
79.2k
    }
4526
4527
243k
    make_decode_exception(exceptionObject,
4528
243k
        encoding,
4529
243k
        *input, *inend - *input,
4530
243k
        *startinpos, *endinpos,
4531
243k
        reason);
4532
243k
    if (*exceptionObject == NULL)
4533
0
        goto onError;
4534
4535
243k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4536
243k
    if (restuple == NULL)
4537
61.5k
        goto onError;
4538
181k
    if (!PyTuple_Check(restuple)) {
4539
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4540
0
        goto onError;
4541
0
    }
4542
181k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4543
0
        goto onError;
4544
4545
    /* Copy back the bytes variables, which might have been modified by the
4546
       callback */
4547
181k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4548
181k
    if (!inputobj)
4549
0
        goto onError;
4550
181k
    remain = *inend - *input - *endinpos;
4551
181k
    *input = PyBytes_AS_STRING(inputobj);
4552
181k
    insize = PyBytes_GET_SIZE(inputobj);
4553
181k
    *inend = *input + insize;
4554
    /* we can DECREF safely, as the exception has another reference,
4555
       so the object won't go away. */
4556
181k
    Py_DECREF(inputobj);
4557
4558
181k
    if (newpos<0)
4559
0
        newpos = insize+newpos;
4560
181k
    if (newpos<0 || newpos>insize) {
4561
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4562
0
        goto onError;
4563
0
    }
4564
4565
181k
    replen = PyUnicode_GET_LENGTH(repunicode);
4566
181k
    if (replen > 1) {
4567
25.6k
        writer->min_length += replen - 1;
4568
25.6k
        need_to_grow = 1;
4569
25.6k
    }
4570
181k
    new_inptr = *input + newpos;
4571
181k
    if (*inend - new_inptr > remain) {
4572
        /* We don't know the decoding algorithm here so we make the worst
4573
           assumption that one byte decodes to one unicode character.
4574
           If unfortunately one byte could decode to more unicode characters,
4575
           the decoder may write out-of-bound then.  Is it possible for the
4576
           algorithms using this function? */
4577
17.2k
        writer->min_length += *inend - new_inptr - remain;
4578
17.2k
        need_to_grow = 1;
4579
17.2k
    }
4580
181k
    if (need_to_grow) {
4581
25.8k
        writer->overallocate = 1;
4582
25.8k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4583
25.8k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4584
0
            goto onError;
4585
25.8k
    }
4586
181k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4587
0
        goto onError;
4588
4589
181k
    *endinpos = newpos;
4590
181k
    *inptr = new_inptr;
4591
4592
    /* we made it! */
4593
181k
    Py_DECREF(restuple);
4594
181k
    return 0;
4595
4596
61.5k
  onError:
4597
61.5k
    Py_XDECREF(restuple);
4598
61.5k
    return -1;
4599
181k
}
4600
4601
/* --- UTF-7 Codec -------------------------------------------------------- */
4602
4603
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4604
4605
/* Three simple macros defining base-64. */
4606
4607
/* Is c a base-64 character, i.e. one of A-Z, a-z, 0-9, '+' or '/'?
   Note: the argument is evaluated more than once. */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ('0' <= (c) && (c) <= '9') ||              \
     ('A' <= (c) && (c) <= 'Z') ||              \
     ('a' <= (c) && (c) <= 'z'))
4614
4615
/* Given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62 and anything
   else (i.e. '/') to 63.  The argument is evaluated more than once. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     63)
4622
4623
/* What is the base-64 character of the bottom 6 bits of n?
   Only the low six bits of n select the character; higher bits are
   masked off. */

#define TO_BASE64(n)                            \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"              \
      "abcdefghijklmnopqrstuvwxyz"              \
      "0123456789+/")[(n) & 0x3f])
4627
4628
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every ASCII byte decodes
 * to itself except '+', which begins a base64 string.  The argument is
 * evaluated twice. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
4635
4636
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code 0-127,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4669
4670
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - the character; must be in 1..127 to be direct
 *   directO  - non-zero if "Set O" characters (category 1) are direct
 *   directWS - non-zero if whitespace (category 2) is direct
 *
 * "Set D" characters (category 0) are always direct.  The flag arguments
 * are parenthesized so that compound expressions (e.g. `a || b`) bind
 * as a unit.  `c` is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4681
4682
PyObject *
4683
PyUnicode_DecodeUTF7(const char *s,
4684
                     Py_ssize_t size,
4685
                     const char *errors)
4686
0
{
4687
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4688
0
}
4689
4690
/* The decoder.  The only state we preserve is our read position,
4691
 * i.e. how many characters we have consumed.  So if we end in the
4692
 * middle of a shift sequence we have to back off the read position
4693
 * and the output to the beginning of the sequence, otherwise we lose
4694
 * all the shift state (seen bits, number of bits seen, high
4695
 * surrogate). */
4696
4697
PyObject *
4698
PyUnicode_DecodeUTF7Stateful(const char *s,
4699
                             Py_ssize_t size,
4700
                             const char *errors,
4701
                             Py_ssize_t *consumed)
4702
20.1k
{
4703
20.1k
    const char *starts = s;
4704
20.1k
    Py_ssize_t startinpos;
4705
20.1k
    Py_ssize_t endinpos;
4706
20.1k
    const char *e;
4707
20.1k
    _PyUnicodeWriter writer;
4708
20.1k
    const char *errmsg = "";
4709
20.1k
    int inShift = 0;
4710
20.1k
    Py_ssize_t shiftOutStart;
4711
20.1k
    unsigned int base64bits = 0;
4712
20.1k
    unsigned long base64buffer = 0;
4713
20.1k
    Py_UCS4 surrogate = 0;
4714
20.1k
    PyObject *errorHandler = NULL;
4715
20.1k
    PyObject *exc = NULL;
4716
4717
20.1k
    if (size == 0) {
4718
0
        if (consumed)
4719
0
            *consumed = 0;
4720
0
        _Py_RETURN_UNICODE_EMPTY();
4721
0
    }
4722
4723
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4724
20.1k
    _PyUnicodeWriter_Init(&writer);
4725
20.1k
    writer.min_length = size;
4726
4727
20.1k
    shiftOutStart = 0;
4728
20.1k
    e = s + size;
4729
4730
7.68M
    while (s < e) {
4731
7.66M
        Py_UCS4 ch;
4732
7.66M
      restart:
4733
7.66M
        ch = (unsigned char) *s;
4734
4735
7.66M
        if (inShift) { /* in a base-64 section */
4736
204k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4737
189k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4738
189k
                base64bits += 6;
4739
189k
                s++;
4740
189k
                if (base64bits >= 16) {
4741
                    /* we have enough bits for a UTF-16 value */
4742
65.5k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4743
65.5k
                    base64bits -= 16;
4744
65.5k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4745
65.5k
                    assert(outCh <= 0xffff);
4746
65.5k
                    if (surrogate) {
4747
                        /* expecting a second surrogate */
4748
7.77k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4749
2.27k
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4750
2.27k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4751
0
                                goto onError;
4752
2.27k
                            surrogate = 0;
4753
2.27k
                            continue;
4754
2.27k
                        }
4755
5.49k
                        else {
4756
5.49k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4757
0
                                goto onError;
4758
5.49k
                            surrogate = 0;
4759
5.49k
                        }
4760
7.77k
                    }
4761
63.2k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4762
                        /* first surrogate */
4763
10.0k
                        surrogate = outCh;
4764
10.0k
                    }
4765
53.1k
                    else {
4766
53.1k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4767
0
                            goto onError;
4768
53.1k
                    }
4769
63.2k
                }
4770
189k
            }
4771
14.6k
            else { /* now leaving a base-64 section */
4772
14.6k
                inShift = 0;
4773
14.6k
                if (base64bits > 0) { /* left-over bits */
4774
11.6k
                    if (base64bits >= 6) {
4775
                        /* We've seen at least one base-64 character */
4776
6.41k
                        s++;
4777
6.41k
                        errmsg = "partial character in shift sequence";
4778
6.41k
                        goto utf7Error;
4779
6.41k
                    }
4780
5.28k
                    else {
4781
                        /* Some bits remain; they should be zero */
4782
5.28k
                        if (base64buffer != 0) {
4783
1.07k
                            s++;
4784
1.07k
                            errmsg = "non-zero padding bits in shift sequence";
4785
1.07k
                            goto utf7Error;
4786
1.07k
                        }
4787
5.28k
                    }
4788
11.6k
                }
4789
7.13k
                if (surrogate && DECODE_DIRECT(ch)) {
4790
1.84k
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4791
0
                        goto onError;
4792
1.84k
                }
4793
7.13k
                surrogate = 0;
4794
7.13k
                if (ch == '-') {
4795
                    /* '-' is absorbed; other terminating
4796
                       characters are preserved */
4797
1.77k
                    s++;
4798
1.77k
                }
4799
7.13k
            }
4800
204k
        }
4801
7.46M
        else if ( ch == '+' ) {
4802
22.7k
            startinpos = s-starts;
4803
22.7k
            s++; /* consume '+' */
4804
22.7k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4805
2.02k
                s++;
4806
2.02k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4807
0
                    goto onError;
4808
2.02k
            }
4809
20.7k
            else if (s < e && !IS_BASE64(*s)) {
4810
3.17k
                s++;
4811
3.17k
                errmsg = "ill-formed sequence";
4812
3.17k
                goto utf7Error;
4813
3.17k
            }
4814
17.5k
            else { /* begin base64-encoded section */
4815
17.5k
                inShift = 1;
4816
17.5k
                surrogate = 0;
4817
17.5k
                shiftOutStart = writer.pos;
4818
17.5k
                base64bits = 0;
4819
17.5k
                base64buffer = 0;
4820
17.5k
            }
4821
22.7k
        }
4822
7.44M
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4823
7.35M
            s++;
4824
7.35M
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4825
0
                goto onError;
4826
7.35M
        }
4827
84.3k
        else {
4828
84.3k
            startinpos = s-starts;
4829
84.3k
            s++;
4830
84.3k
            errmsg = "unexpected special character";
4831
84.3k
            goto utf7Error;
4832
84.3k
        }
4833
7.57M
        continue;
4834
7.57M
utf7Error:
4835
94.9k
        endinpos = s-starts;
4836
94.9k
        if (unicode_decode_call_errorhandler_writer(
4837
94.9k
                errors, &errorHandler,
4838
94.9k
                "utf7", errmsg,
4839
94.9k
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4840
94.9k
                &writer))
4841
8.22k
            goto onError;
4842
94.9k
    }
4843
4844
    /* end of string */
4845
4846
11.9k
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4847
        /* if we're in an inconsistent state, that's an error */
4848
2.96k
        inShift = 0;
4849
2.96k
        if (surrogate ||
4850
2.96k
                (base64bits >= 6) ||
4851
2.96k
                (base64bits > 0 && base64buffer != 0)) {
4852
1.90k
            endinpos = size;
4853
1.90k
            if (unicode_decode_call_errorhandler_writer(
4854
1.90k
                    errors, &errorHandler,
4855
1.90k
                    "utf7", "unterminated shift sequence",
4856
1.90k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4857
1.90k
                    &writer))
4858
1.62k
                goto onError;
4859
277
            if (s < e)
4860
0
                goto restart;
4861
277
        }
4862
2.96k
    }
4863
4864
    /* return state */
4865
10.3k
    if (consumed) {
4866
0
        if (inShift) {
4867
0
            *consumed = startinpos;
4868
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4869
0
                PyObject *result = PyUnicode_FromKindAndData(
4870
0
                        writer.kind, writer.data, shiftOutStart);
4871
0
                Py_XDECREF(errorHandler);
4872
0
                Py_XDECREF(exc);
4873
0
                _PyUnicodeWriter_Dealloc(&writer);
4874
0
                return result;
4875
0
            }
4876
0
            writer.pos = shiftOutStart; /* back off output */
4877
0
        }
4878
0
        else {
4879
0
            *consumed = s-starts;
4880
0
        }
4881
0
    }
4882
4883
10.3k
    Py_XDECREF(errorHandler);
4884
10.3k
    Py_XDECREF(exc);
4885
10.3k
    return _PyUnicodeWriter_Finish(&writer);
4886
4887
9.84k
  onError:
4888
9.84k
    Py_XDECREF(errorHandler);
4889
9.84k
    Py_XDECREF(exc);
4890
9.84k
    _PyUnicodeWriter_Dealloc(&writer);
4891
9.84k
    return NULL;
4892
10.3k
}
4893
4894
4895
/* Encode the string `str` as UTF-7 and return a new bytes object, or
 * NULL on error.
 *
 * base64SetO       - if non-zero, "Set O" characters are base64-encoded
 *                    instead of being written directly
 * base64WhiteSpace - if non-zero, whitespace is base64-encoded instead
 *                    of being written directly
 * errors           - not used by this function
 *
 * The output buffer is allocated up front at 8 bytes per input character
 * and trimmed to the bytes actually written at the end.
 */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    PyObject *bytes;
    char *out;
    const char *out_start;
    int in_shift = 0;
    unsigned int nbits = 0;         /* number of pending bits in bitbuf */
    unsigned long bitbuf = 0;       /* bits not yet emitted as base-64 */
    Py_ssize_t pos;

    if (len == 0) {
        return PyBytes_FromStringAndSize(NULL, 0);
    }

    /* It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    bytes = PyBytes_FromStringAndSize(NULL, len * 8);
    if (bytes == NULL) {
        return NULL;
    }

    out = PyBytes_AS_STRING(bytes);
    out_start = out;
    for (pos = 0; pos < len; ++pos) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        int needs_base64 = 0;

        if (in_shift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (nbits) { /* output remaining bits */
                    *out++ = TO_BASE64(bitbuf << (6-nbits));
                    bitbuf = 0;
                    nbits = 0;
                }
                in_shift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                needs_base64 = 1;
            }
        }
        else if (ch == '+') {
            /* a literal '+' is written as "+-" */
            *out++ = '+';
            *out++ = '-';
        }
        else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
            *out++ = (char) ch;
        }
        else {
            /* open a shift sequence */
            *out++ = '+';
            in_shift = 1;
            needs_base64 = 1;
        }

        if (!needs_base64) {
            continue;
        }

        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            nbits += 16;
            bitbuf = (bitbuf << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (nbits >= 6) {
                *out++ = TO_BASE64(bitbuf >> (nbits-6));
                nbits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        nbits += 16;
        bitbuf = (bitbuf << 16) | ch;
        while (nbits >= 6) {
            *out++ = TO_BASE64(bitbuf >> (nbits-6));
            nbits -= 6;
        }
    }

    /* Flush leftover bits, then close a still-open shift sequence. */
    if (nbits) {
        *out++ = TO_BASE64(bitbuf << (6-nbits));
    }
    if (in_shift) {
        *out++ = '-';
    }
    if (_PyBytes_Resize(&bytes, out - out_start) < 0) {
        return NULL;
    }
    return bytes;
}
4994
4995
#undef IS_BASE64
4996
#undef FROM_BASE64
4997
#undef TO_BASE64
4998
#undef DECODE_DIRECT
4999
#undef ENCODE_DIRECT
5000
5001
/* --- UTF-8 Codec -------------------------------------------------------- */
5002
5003
PyObject *
5004
PyUnicode_DecodeUTF8(const char *s,
5005
                     Py_ssize_t size,
5006
                     const char *errors)
5007
2.48M
{
5008
2.48M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5009
2.48M
}
5010
5011
#include "stringlib/asciilib.h"
5012
#include "stringlib/codecs.h"
5013
#include "stringlib/undef.h"
5014
5015
#include "stringlib/ucs1lib.h"
5016
#include "stringlib/codecs.h"
5017
#include "stringlib/undef.h"
5018
5019
#include "stringlib/ucs2lib.h"
5020
#include "stringlib/codecs.h"
5021
#include "stringlib/undef.h"
5022
5023
#include "stringlib/ucs4lib.h"
5024
#include "stringlib/codecs.h"
5025
#include "stringlib/undef.h"
5026
5027
/* Word-sized bit patterns, selected by the width of 'size_t':
 *
 *   ASCII_CHAR_MASK - 0x80 in every byte; mask to quickly check whether
 *                     a C 'size_t' contains a non-ASCII, UTF8-encoded
 *                     char.
 *   VECTOR_0101     - 0x01 in every byte.
 *   VECTOR_00FF     - 0xff in every other byte.
 *
 * The two VECTOR_* constants are used to count codepoints in a UTF-8
 * string.
 */
#if SIZEOF_SIZE_T == 8
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#elif SIZEOF_SIZE_T == 4
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#else
# error C 'size_t' size should be either 4 or 8!
#endif
5041
5042
/* ctz(v): number of trailing zero bits in v, implemented with a compiler
 * intrinsic.  HAVE_CTZ is 1 when an implementation is available and 0
 * otherwise (in which case ctz() is not defined).
 * NOTE(review): neither intrinsic path handles v == 0 specially; callers
 * are expected to pass a non-zero value. */
#if defined(__GNUC__) || defined(__clang__)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    const int trailing = __builtin_ctzll((unsigned long long)v);
    return (unsigned int)trailing;
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long bit_index;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&bit_index, v);
#else
    _BitScanForward64(&bit_index, v);
#endif /* SIZEOF_SIZE_T */
    return (unsigned int)bit_index;
}
#else
#define HAVE_CTZ 0
#endif
5065
5066
#if HAVE_CTZ && PY_LITTLE_ENDIAN
// Build a size_t from p[0]..p[size-1] without any unaligned access and
// without reading past p[size-1].  Bytes not covered by `size` are zero;
// a `size` above SIZEOF_SIZE_T is treated like SIZEOF_SIZE_T.
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t value;
        unsigned char bytes[SIZEOF_SIZE_T];
    } word;
    word.value = 0;

    // The byte-wise fill assumes little endian because:
    // * union is faster than bitwise or and shift.
    // * big endian machine is rare and hard to maintain.
    //
    // Each case copies one byte and falls through to the next lower
    // index, so entering at `size` copies exactly bytes size-1 .. 0.
    switch (size) {
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        word.bytes[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        word.bytes[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        word.bytes[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        word.bytes[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        word.bytes[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        word.bytes[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        word.bytes[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        word.bytes[0] = p[0];
        break;
    case 0:
        // nothing to copy
        break;
    }
    return word.value;
}
#endif
5113
5114
/*
5115
 * Find the first non-ASCII character in a byte sequence.
5116
 *
5117
 * This function scans a range of bytes from `start` to `end` and returns the
5118
 * index of the first byte that is not an ASCII character (i.e., has the most
5119
 * significant bit set). If all characters in the range are ASCII, it returns
5120
 * `end - start`.
5121
 */
5122
static Py_ssize_t
5123
find_first_nonascii(const unsigned char *start, const unsigned char *end)
5124
12.6M
{
5125
    // The search is done in `size_t` chunks.
5126
    // The start and end might not be aligned at `size_t` boundaries,
5127
    // so they're handled specially.
5128
5129
12.6M
    const unsigned char *p = start;
5130
5131
12.6M
    if (end - start >= SIZEOF_SIZE_T) {
5132
        // Avoid unaligned read.
5133
3.07M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5134
3.07M
        size_t u;
5135
3.07M
        memcpy(&u, p, sizeof(size_t));
5136
3.07M
        u &= ASCII_CHAR_MASK;
5137
3.07M
        if (u) {
5138
128k
            return (ctz(u) - 7) / 8;
5139
128k
        }
5140
2.94M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
5141
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
5142
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
5143
        while (p < p2) {
5144
            if (*p & 0x80) {
5145
                return p - start;
5146
            }
5147
            p++;
5148
        }
5149
#endif
5150
5151
2.94M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5152
88.0M
        while (p <= e) {
5153
85.2M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5154
85.2M
            if (u) {
5155
89.0k
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5156
89.0k
                return p - start + (ctz(u) - 7) / 8;
5157
#else
5158
                // big endian and minor compilers are difficult to test.
5159
                // fallback to per byte check.
5160
                break;
5161
#endif
5162
89.0k
            }
5163
85.1M
            p += SIZEOF_SIZE_T;
5164
85.1M
        }
5165
2.94M
    }
5166
12.4M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5167
12.4M
    assert((end - p) < SIZEOF_SIZE_T);
5168
    // we can not use *(const size_t*)p to avoid buffer overrun.
5169
12.4M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5170
12.4M
    if (u) {
5171
161k
        return p - start + (ctz(u) - 7) / 8;
5172
161k
    }
5173
12.2M
    return end - start;
5174
#else
5175
    while (p < end) {
5176
        if (*p & 0x80) {
5177
            break;
5178
        }
5179
        p++;
5180
    }
5181
    return p - start;
5182
#endif
5183
12.4M
}
5184
5185
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    // Returns 1 when `ch` starts a UTF-8 sequence and 0 otherwise.
    // Only a continuation byte has the top two bits equal to 10; bytes of
    // the form 0xxxxxxx or 11xxxxxx are first bytes.
    return (ch & 0xC0) != 0x80;
}
5191
5192
static inline size_t
5193
vector_utf8_start_chars(size_t v)
5194
237M
{
5195
237M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5196
237M
}
5197
5198
5199
// Count the number of UTF-8 code points in a given byte sequence.
5200
static Py_ssize_t
5201
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5202
95.8k
{
5203
95.8k
    Py_ssize_t len = 0;
5204
5205
95.8k
    if (end - s >= SIZEOF_SIZE_T) {
5206
49.0k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5207
20.3k
            len += scalar_utf8_start_char(*s++);
5208
20.3k
        }
5209
5210
982k
        while (s + SIZEOF_SIZE_T <= end) {
5211
953k
            const unsigned char *e = end;
5212
953k
            if (e - s > SIZEOF_SIZE_T * 255) {
5213
927k
                e = s + SIZEOF_SIZE_T * 255;
5214
927k
            }
5215
953k
            Py_ssize_t vstart = 0;
5216
238M
            while (s + SIZEOF_SIZE_T <= e) {
5217
237M
                size_t v = *(size_t*)s;
5218
237M
                size_t vs = vector_utf8_start_chars(v);
5219
237M
                vstart += vs;
5220
237M
                s += SIZEOF_SIZE_T;
5221
237M
            }
5222
953k
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5223
953k
            vstart += vstart >> 16;
5224
953k
#if SIZEOF_SIZE_T == 8
5225
953k
            vstart += vstart >> 32;
5226
953k
#endif
5227
953k
            len += vstart & 0x7ff;
5228
953k
        }
5229
28.6k
    }
5230
426k
    while (s < end) {
5231
330k
        len += scalar_utf8_start_char(*s++);
5232
330k
    }
5233
95.8k
    return len;
5234
95.8k
}
5235
5236
static Py_ssize_t
5237
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5238
5.53M
{
5239
5.53M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5240
5.53M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5241
5.53M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5242
742k
    {
5243
        /* Fast path, see in STRINGLIB(utf8_decode) for
5244
           an explanation. */
5245
742k
        const char *p = start;
5246
742k
        Py_UCS1 *q = dest;
5247
1.64M
        while (p + SIZEOF_SIZE_T <= end) {
5248
1.02M
            size_t value = *(const size_t *) p;
5249
1.02M
            if (value & ASCII_CHAR_MASK)
5250
117k
                break;
5251
906k
            *((size_t *)q) = value;
5252
906k
            p += SIZEOF_SIZE_T;
5253
906k
            q += SIZEOF_SIZE_T;
5254
906k
        }
5255
3.32M
        while (p < end) {
5256
2.72M
            if ((unsigned char)*p & 0x80)
5257
139k
                break;
5258
2.58M
            *q++ = *p++;
5259
2.58M
        }
5260
742k
        return p - start;
5261
742k
    }
5262
4.79M
#endif
5263
4.79M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5264
4.79M
                                         (const unsigned char*)end);
5265
4.79M
    memcpy(dest, start, pos);
5266
4.79M
    return pos;
5267
5.53M
}
5268
5269
static int
5270
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5271
                         const char *starts, const char *s, const char *end,
5272
                         _Py_error_handler error_handler,
5273
                         const char *errors,
5274
                         Py_ssize_t *consumed)
5275
380k
{
5276
380k
    Py_ssize_t startinpos, endinpos;
5277
380k
    const char *errmsg = "";
5278
380k
    PyObject *error_handler_obj = NULL;
5279
380k
    PyObject *exc = NULL;
5280
5281
152M
    while (s < end) {
5282
152M
        Py_UCS4 ch;
5283
152M
        int kind = writer->kind;
5284
5285
152M
        if (kind == PyUnicode_1BYTE_KIND) {
5286
370k
            if (PyUnicode_IS_ASCII(writer->buffer))
5287
282k
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5288
87.6k
            else
5289
87.6k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5290
152M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5291
71.8M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5292
80.4M
        } else {
5293
80.4M
            assert(kind == PyUnicode_4BYTE_KIND);
5294
80.4M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5295
80.4M
        }
5296
5297
152M
        switch (ch) {
5298
316k
        case 0:
5299
316k
            if (s == end || consumed)
5300
294k
                goto End;
5301
21.6k
            errmsg = "unexpected end of data";
5302
21.6k
            startinpos = s - starts;
5303
21.6k
            endinpos = end - starts;
5304
21.6k
            break;
5305
120M
        case 1:
5306
120M
            errmsg = "invalid start byte";
5307
120M
            startinpos = s - starts;
5308
120M
            endinpos = startinpos + 1;
5309
120M
            break;
5310
30.0M
        case 2:
5311
30.0M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5312
30.0M
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5313
0
            {
5314
                /* Truncated surrogate code in range D800-DFFF */
5315
0
                goto End;
5316
0
            }
5317
30.0M
            _Py_FALLTHROUGH;
5318
31.1M
        case 3:
5319
31.3M
        case 4:
5320
31.3M
            errmsg = "invalid continuation byte";
5321
31.3M
            startinpos = s - starts;
5322
31.3M
            endinpos = startinpos + ch - 1;
5323
31.3M
            break;
5324
276k
        default:
5325
            // ch doesn't fit into kind, so change the buffer kind to write
5326
            // the character
5327
276k
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5328
0
                goto onError;
5329
276k
            continue;
5330
152M
        }
5331
5332
152M
        if (error_handler == _Py_ERROR_UNKNOWN)
5333
104k
            error_handler = _Py_GetErrorHandler(errors);
5334
5335
152M
        switch (error_handler) {
5336
0
        case _Py_ERROR_IGNORE:
5337
0
            s += (endinpos - startinpos);
5338
0
            break;
5339
5340
152M
        case _Py_ERROR_REPLACE:
5341
152M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5342
0
                goto onError;
5343
152M
            s += (endinpos - startinpos);
5344
152M
            break;
5345
5346
3.00k
        case _Py_ERROR_SURROGATEESCAPE:
5347
3.00k
        {
5348
3.00k
            Py_ssize_t i;
5349
5350
3.00k
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5351
0
                goto onError;
5352
6.38k
            for (i=startinpos; i<endinpos; i++) {
5353
3.38k
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5354
3.38k
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5355
3.38k
                                ch + 0xdc00);
5356
3.38k
                writer->pos++;
5357
3.38k
            }
5358
3.00k
            s += (endinpos - startinpos);
5359
3.00k
            break;
5360
3.00k
        }
5361
5362
3.36k
        default:
5363
3.36k
            if (unicode_decode_call_errorhandler_writer(
5364
3.36k
                    errors, &error_handler_obj,
5365
3.36k
                    "utf-8", errmsg,
5366
3.36k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5367
3.36k
                    writer)) {
5368
3.36k
                goto onError;
5369
3.36k
            }
5370
5371
0
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5372
0
                return -1;
5373
0
            }
5374
152M
        }
5375
152M
    }
5376
5377
377k
End:
5378
377k
    if (consumed)
5379
1.69k
        *consumed = s - starts;
5380
5381
377k
    Py_XDECREF(error_handler_obj);
5382
377k
    Py_XDECREF(exc);
5383
377k
    return 0;
5384
5385
3.36k
onError:
5386
3.36k
    Py_XDECREF(error_handler_obj);
5387
3.36k
    Py_XDECREF(exc);
5388
3.36k
    return -1;
5389
380k
}
5390
5391
5392
static PyObject *
5393
unicode_decode_utf8(const char *s, Py_ssize_t size,
5394
                    _Py_error_handler error_handler, const char *errors,
5395
                    Py_ssize_t *consumed)
5396
9.69M
{
5397
9.69M
    if (size == 0) {
5398
65.3k
        if (consumed) {
5399
0
            *consumed = 0;
5400
0
        }
5401
65.3k
        _Py_RETURN_UNICODE_EMPTY();
5402
65.3k
    }
5403
5404
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5405
9.63M
    if (size == 1 && (unsigned char)s[0] < 128) {
5406
1.81M
        if (consumed) {
5407
0
            *consumed = 1;
5408
0
        }
5409
1.81M
        return get_latin1_char((unsigned char)s[0]);
5410
1.81M
    }
5411
5412
    // I don't know this check is necessary or not. But there is a test
5413
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5414
7.82M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5415
0
        PyErr_NoMemory();
5416
0
        return NULL;
5417
0
    }
5418
5419
7.82M
    const char *starts = s;
5420
7.82M
    const char *end = s + size;
5421
5422
7.82M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5423
7.82M
    if (pos == size) {  // fast path: ASCII string.
5424
7.49M
        PyObject *u = PyUnicode_New(size, 127);
5425
7.49M
        if (u == NULL) {
5426
0
            return NULL;
5427
0
        }
5428
7.49M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5429
7.49M
        if (consumed) {
5430
0
            *consumed = size;
5431
0
        }
5432
7.49M
        return u;
5433
7.49M
    }
5434
5435
327k
    int maxchr = 127;
5436
327k
    Py_ssize_t maxsize = size;
5437
5438
327k
    unsigned char ch = (unsigned char)(s[pos]);
5439
    // error handler other than strict may remove/replace the invalid byte.
5440
    // consumed != NULL allows 1~3 bytes remainings.
5441
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5442
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5443
    // reallocation and copy.
5444
327k
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5445
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5446
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5447
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5448
        // means that it is no longer necessary to allocate several times the required amount
5449
        // of memory.
5450
95.8k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5451
95.8k
        if (ch < 0xc4) { // latin1
5452
17.2k
            maxchr = 0xff;
5453
17.2k
        }
5454
78.5k
        else if (ch < 0xf0) { // ucs2
5455
69.2k
            maxchr = 0xffff;
5456
69.2k
        }
5457
9.26k
        else { // ucs4
5458
9.26k
            maxchr = 0x10ffff;
5459
9.26k
        }
5460
95.8k
    }
5461
327k
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5462
327k
    if (!u) {
5463
0
        return NULL;
5464
0
    }
5465
5466
    // Use _PyUnicodeWriter after fast path is failed.
5467
327k
    _PyUnicodeWriter writer;
5468
327k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5469
327k
    if (maxchr <= 255) {
5470
248k
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5471
248k
        s += pos;
5472
248k
        size -= pos;
5473
248k
        writer.pos = pos;
5474
248k
    }
5475
5476
327k
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5477
327k
                                 error_handler, errors,
5478
327k
                                 consumed) < 0) {
5479
3.36k
        _PyUnicodeWriter_Dealloc(&writer);
5480
3.36k
        return NULL;
5481
3.36k
    }
5482
323k
    return _PyUnicodeWriter_Finish(&writer);
5483
327k
}
5484
5485
5486
// Used by PyUnicodeWriter_WriteUTF8() implementation
5487
static int
5488
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
5489
                           const char *s, Py_ssize_t size,
5490
                           _Py_error_handler error_handler, const char *errors,
5491
                           Py_ssize_t *consumed)
5492
4.80M
{
5493
4.80M
    if (size == 0) {
5494
7.39k
        if (consumed) {
5495
0
            *consumed = 0;
5496
0
        }
5497
7.39k
        return 0;
5498
7.39k
    }
5499
5500
    // fast path: try ASCII string.
5501
4.79M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5502
0
        return -1;
5503
0
    }
5504
5505
4.79M
    const char *starts = s;
5506
4.79M
    const char *end = s + size;
5507
4.79M
    Py_ssize_t decoded = 0;
5508
4.79M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5509
4.79M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5510
4.79M
        decoded = ascii_decode(s, end, dest);
5511
4.79M
        writer->pos += decoded;
5512
5513
4.79M
        if (decoded == size) {
5514
4.74M
            if (consumed) {
5515
1.49k
                *consumed = size;
5516
1.49k
            }
5517
4.74M
            return 0;
5518
4.74M
        }
5519
51.3k
        s += decoded;
5520
51.3k
        size -= decoded;
5521
51.3k
    }
5522
5523
53.6k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5524
53.6k
                                    error_handler, errors, consumed);
5525
4.79M
}
5526
5527
5528
PyObject *
5529
PyUnicode_DecodeUTF8Stateful(const char *s,
5530
                             Py_ssize_t size,
5531
                             const char *errors,
5532
                             Py_ssize_t *consumed)
5533
9.69M
{
5534
9.69M
    return unicode_decode_utf8(s, size,
5535
9.69M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5536
9.69M
                               errors, consumed);
5537
9.69M
}
5538
5539
5540
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5541
   non-zero, use strict error handler otherwise.
5542
5543
   On success, write a pointer to a newly allocated wide character string into
5544
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5545
   (in number of wchar_t units) into *wlen (if wlen is set).
5546
5547
   On memory allocation failure, return -1.
5548
5549
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5550
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5551
   is not NULL, write the decoding error message into *reason. */
5552
int
5553
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5554
                 const char **reason, _Py_error_handler errors)
5555
0
{
5556
0
    const char *orig_s = s;
5557
0
    const char *e;
5558
0
    wchar_t *unicode;
5559
0
    Py_ssize_t outpos;
5560
5561
0
    int surrogateescape = 0;
5562
0
    int surrogatepass = 0;
5563
0
    switch (errors)
5564
0
    {
5565
0
    case _Py_ERROR_STRICT:
5566
0
        break;
5567
0
    case _Py_ERROR_SURROGATEESCAPE:
5568
0
        surrogateescape = 1;
5569
0
        break;
5570
0
    case _Py_ERROR_SURROGATEPASS:
5571
0
        surrogatepass = 1;
5572
0
        break;
5573
0
    default:
5574
0
        return -3;
5575
0
    }
5576
5577
    /* Note: size will always be longer than the resulting Unicode
5578
       character count */
5579
0
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5580
0
        return -1;
5581
0
    }
5582
5583
0
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5584
0
    if (!unicode) {
5585
0
        return -1;
5586
0
    }
5587
5588
    /* Unpack UTF-8 encoded data */
5589
0
    e = s + size;
5590
0
    outpos = 0;
5591
0
    while (s < e) {
5592
0
        Py_UCS4 ch;
5593
0
#if SIZEOF_WCHAR_T == 4
5594
0
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5595
#else
5596
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5597
#endif
5598
0
        if (ch > 0xFF) {
5599
0
#if SIZEOF_WCHAR_T == 4
5600
0
            Py_UNREACHABLE();
5601
#else
5602
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5603
            /* write a surrogate pair */
5604
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5605
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5606
#endif
5607
0
        }
5608
0
        else {
5609
0
            if (!ch && s == e) {
5610
0
                break;
5611
0
            }
5612
5613
0
            if (surrogateescape) {
5614
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5615
0
            }
5616
0
            else {
5617
                /* Is it a valid three-byte code? */
5618
0
                if (surrogatepass
5619
0
                    && (e - s) >= 3
5620
0
                    && (s[0] & 0xf0) == 0xe0
5621
0
                    && (s[1] & 0xc0) == 0x80
5622
0
                    && (s[2] & 0xc0) == 0x80)
5623
0
                {
5624
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5625
0
                    s += 3;
5626
0
                    unicode[outpos++] = ch;
5627
0
                }
5628
0
                else {
5629
0
                    PyMem_RawFree(unicode );
5630
0
                    if (reason != NULL) {
5631
0
                        switch (ch) {
5632
0
                        case 0:
5633
0
                            *reason = "unexpected end of data";
5634
0
                            break;
5635
0
                        case 1:
5636
0
                            *reason = "invalid start byte";
5637
0
                            break;
5638
                        /* 2, 3, 4 */
5639
0
                        default:
5640
0
                            *reason = "invalid continuation byte";
5641
0
                            break;
5642
0
                        }
5643
0
                    }
5644
0
                    if (wlen != NULL) {
5645
0
                        *wlen = s - orig_s;
5646
0
                    }
5647
0
                    return -2;
5648
0
                }
5649
0
            }
5650
0
        }
5651
0
    }
5652
0
    unicode[outpos] = L'\0';
5653
0
    if (wlen) {
5654
0
        *wlen = outpos;
5655
0
    }
5656
0
    *wstr = unicode;
5657
0
    return 0;
5658
0
}
5659
5660
5661
wchar_t*
5662
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5663
                               size_t *wlen)
5664
0
{
5665
0
    wchar_t *wstr;
5666
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5667
0
                               &wstr, wlen,
5668
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5669
0
    if (res != 0) {
5670
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5671
0
        assert(res != -3);
5672
0
        if (wlen) {
5673
0
            *wlen = (size_t)res;
5674
0
        }
5675
0
        return NULL;
5676
0
    }
5677
0
    return wstr;
5678
0
}
5679
5680
5681
/* UTF-8 encoder.
5682
5683
   On success, return 0 and write the newly allocated character string (use
5684
   PyMem_Free() to free the memory) into *str.
5685
5686
   On encoding failure, return -2 and write the position of the invalid
5687
   surrogate character into *error_pos (if error_pos is set) and the decoding
5688
   error message into *reason (if reason is set).
5689
5690
   On memory allocation failure, return -1. */
5691
int
5692
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5693
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5694
64
{
5695
64
    const Py_ssize_t max_char_size = 4;
5696
64
    Py_ssize_t len = wcslen(text);
5697
5698
64
    assert(len >= 0);
5699
5700
64
    int surrogateescape = 0;
5701
64
    int surrogatepass = 0;
5702
64
    switch (errors)
5703
64
    {
5704
64
    case _Py_ERROR_STRICT:
5705
64
        break;
5706
0
    case _Py_ERROR_SURROGATEESCAPE:
5707
0
        surrogateescape = 1;
5708
0
        break;
5709
0
    case _Py_ERROR_SURROGATEPASS:
5710
0
        surrogatepass = 1;
5711
0
        break;
5712
0
    default:
5713
0
        return -3;
5714
64
    }
5715
5716
64
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5717
0
        return -1;
5718
0
    }
5719
64
    char *bytes;
5720
64
    if (raw_malloc) {
5721
64
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5722
64
    }
5723
0
    else {
5724
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5725
0
    }
5726
64
    if (bytes == NULL) {
5727
0
        return -1;
5728
0
    }
5729
5730
64
    char *p = bytes;
5731
64
    Py_ssize_t i;
5732
832
    for (i = 0; i < len; ) {
5733
768
        Py_ssize_t ch_pos = i;
5734
768
        Py_UCS4 ch = text[i];
5735
768
        i++;
5736
#if Py_UNICODE_SIZE == 2
5737
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5738
            && i < len
5739
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5740
        {
5741
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5742
            i++;
5743
        }
5744
#endif
5745
5746
768
        if (ch < 0x80) {
5747
            /* Encode ASCII */
5748
768
            *p++ = (char) ch;
5749
5750
768
        }
5751
0
        else if (ch < 0x0800) {
5752
            /* Encode Latin-1 */
5753
0
            *p++ = (char)(0xc0 | (ch >> 6));
5754
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5755
0
        }
5756
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5757
            /* surrogateescape error handler */
5758
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5759
0
                if (error_pos != NULL) {
5760
0
                    *error_pos = (size_t)ch_pos;
5761
0
                }
5762
0
                if (reason != NULL) {
5763
0
                    *reason = "encoding error";
5764
0
                }
5765
0
                if (raw_malloc) {
5766
0
                    PyMem_RawFree(bytes);
5767
0
                }
5768
0
                else {
5769
0
                    PyMem_Free(bytes);
5770
0
                }
5771
0
                return -2;
5772
0
            }
5773
0
            *p++ = (char)(ch & 0xff);
5774
0
        }
5775
0
        else if (ch < 0x10000) {
5776
0
            *p++ = (char)(0xe0 | (ch >> 12));
5777
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5778
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5779
0
        }
5780
0
        else {  /* ch >= 0x10000 */
5781
0
            assert(ch <= MAX_UNICODE);
5782
            /* Encode UCS4 Unicode ordinals */
5783
0
            *p++ = (char)(0xf0 | (ch >> 18));
5784
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5785
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5786
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5787
0
        }
5788
768
    }
5789
64
    *p++ = '\0';
5790
5791
64
    size_t final_size = (p - bytes);
5792
64
    char *bytes2;
5793
64
    if (raw_malloc) {
5794
64
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5795
64
    }
5796
0
    else {
5797
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5798
0
    }
5799
64
    if (bytes2 == NULL) {
5800
0
        if (error_pos != NULL) {
5801
0
            *error_pos = (size_t)-1;
5802
0
        }
5803
0
        if (raw_malloc) {
5804
0
            PyMem_RawFree(bytes);
5805
0
        }
5806
0
        else {
5807
0
            PyMem_Free(bytes);
5808
0
        }
5809
0
        return -1;
5810
0
    }
5811
64
    *str = bytes2;
5812
64
    return 0;
5813
64
}
5814
5815
5816
/* Primary internal function which creates utf8 encoded bytes objects.
5817
5818
   Allocation strategy:  if the string is short, convert into a stack buffer
5819
   and allocate exactly as much space needed at the end.  Else allocate the
5820
   maximum possible needed (4 result bytes per Unicode character), and return
5821
   the excess memory at the end.
5822
*/
5823
static PyObject *
5824
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5825
                    const char *errors)
5826
12.9M
{
5827
12.9M
    if (!PyUnicode_Check(unicode)) {
5828
0
        PyErr_BadArgument();
5829
0
        return NULL;
5830
0
    }
5831
5832
12.9M
    if (PyUnicode_UTF8(unicode))
5833
7.36M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5834
7.36M
                                         PyUnicode_UTF8_LENGTH(unicode));
5835
5836
5.62M
    int kind = PyUnicode_KIND(unicode);
5837
5.62M
    const void *data = PyUnicode_DATA(unicode);
5838
5.62M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5839
5840
5.62M
    _PyBytesWriter writer;
5841
5.62M
    char *end;
5842
5843
5.62M
    switch (kind) {
5844
0
    default:
5845
0
        Py_UNREACHABLE();
5846
4.21M
    case PyUnicode_1BYTE_KIND:
5847
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5848
4.21M
        assert(!PyUnicode_IS_ASCII(unicode));
5849
4.21M
        end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5850
4.21M
        break;
5851
1.34M
    case PyUnicode_2BYTE_KIND:
5852
1.34M
        end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5853
1.34M
        break;
5854
61.9k
    case PyUnicode_4BYTE_KIND:
5855
61.9k
        end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5856
61.9k
        break;
5857
5.62M
    }
5858
5859
5.62M
    if (end == NULL) {
5860
143k
        _PyBytesWriter_Dealloc(&writer);
5861
143k
        return NULL;
5862
143k
    }
5863
5.47M
    return _PyBytesWriter_Finish(&writer, end);
5864
5.62M
}
5865
5866
static int
5867
unicode_fill_utf8(PyObject *unicode)
5868
121k
{
5869
121k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5870
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5871
121k
    assert(!PyUnicode_IS_ASCII(unicode));
5872
5873
121k
    int kind = PyUnicode_KIND(unicode);
5874
121k
    const void *data = PyUnicode_DATA(unicode);
5875
121k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5876
5877
121k
    _PyBytesWriter writer;
5878
121k
    char *end;
5879
5880
121k
    switch (kind) {
5881
0
    default:
5882
0
        Py_UNREACHABLE();
5883
94.6k
    case PyUnicode_1BYTE_KIND:
5884
94.6k
        end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5885
94.6k
                                   _Py_ERROR_STRICT, NULL);
5886
94.6k
        break;
5887
22.1k
    case PyUnicode_2BYTE_KIND:
5888
22.1k
        end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5889
22.1k
                                   _Py_ERROR_STRICT, NULL);
5890
22.1k
        break;
5891
4.73k
    case PyUnicode_4BYTE_KIND:
5892
4.73k
        end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5893
4.73k
                                   _Py_ERROR_STRICT, NULL);
5894
4.73k
        break;
5895
121k
    }
5896
121k
    if (end == NULL) {
5897
274
        _PyBytesWriter_Dealloc(&writer);
5898
274
        return -1;
5899
274
    }
5900
5901
121k
    const char *start = writer.use_small_buffer ? writer.small_buffer :
5902
121k
                    PyBytes_AS_STRING(writer.buffer);
5903
121k
    Py_ssize_t len = end - start;
5904
5905
121k
    char *cache = PyMem_Malloc(len + 1);
5906
121k
    if (cache == NULL) {
5907
0
        _PyBytesWriter_Dealloc(&writer);
5908
0
        PyErr_NoMemory();
5909
0
        return -1;
5910
0
    }
5911
121k
    memcpy(cache, start, len);
5912
121k
    cache[len] = '\0';
5913
121k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5914
121k
    PyUnicode_SET_UTF8(unicode, cache);
5915
121k
    _PyBytesWriter_Dealloc(&writer);
5916
121k
    return 0;
5917
121k
}
5918
5919
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    /* The error handler is passed as "unknown" together with its name in
       `errors`. */
    PyObject *encoded = unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN,
                                            errors);
    return encoded;
}
5924
5925
5926
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    /* Same as _PyUnicode_AsUTF8String() with no error-handler name. */
    const char *errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, errors);
}
5931
5932
/* --- UTF-32 Codec ------------------------------------------------------- */
5933
5934
PyObject *
5935
PyUnicode_DecodeUTF32(const char *s,
5936
                      Py_ssize_t size,
5937
                      const char *errors,
5938
                      int *byteorder)
5939
47
{
5940
47
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5941
47
}
5942
5943
PyObject *
5944
PyUnicode_DecodeUTF32Stateful(const char *s,
5945
                              Py_ssize_t size,
5946
                              const char *errors,
5947
                              int *byteorder,
5948
                              Py_ssize_t *consumed)
5949
39.2k
{
5950
39.2k
    const char *starts = s;
5951
39.2k
    Py_ssize_t startinpos;
5952
39.2k
    Py_ssize_t endinpos;
5953
39.2k
    _PyUnicodeWriter writer;
5954
39.2k
    const unsigned char *q, *e;
5955
39.2k
    int le, bo = 0;       /* assume native ordering by default */
5956
39.2k
    const char *encoding;
5957
39.2k
    const char *errmsg = "";
5958
39.2k
    PyObject *errorHandler = NULL;
5959
39.2k
    PyObject *exc = NULL;
5960
5961
39.2k
    q = (const unsigned char *)s;
5962
39.2k
    e = q + size;
5963
5964
39.2k
    if (byteorder)
5965
39.2k
        bo = *byteorder;
5966
5967
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5968
       byte order setting accordingly. In native mode, the leading BOM
5969
       mark is skipped, in all other modes, it is copied to the output
5970
       stream as-is (giving a ZWNBSP character). */
5971
39.2k
    if (bo == 0 && size >= 4) {
5972
37.6k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5973
37.6k
        if (bom == 0x0000FEFF) {
5974
158
            bo = -1;
5975
158
            q += 4;
5976
158
        }
5977
37.4k
        else if (bom == 0xFFFE0000) {
5978
158
            bo = 1;
5979
158
            q += 4;
5980
158
        }
5981
37.6k
        if (byteorder)
5982
37.5k
            *byteorder = bo;
5983
37.6k
    }
5984
5985
39.2k
    if (q == e) {
5986
73
        if (consumed)
5987
0
            *consumed = size;
5988
73
        _Py_RETURN_UNICODE_EMPTY();
5989
73
    }
5990
5991
#ifdef WORDS_BIGENDIAN
5992
    le = bo < 0;
5993
#else
5994
39.1k
    le = bo <= 0;
5995
39.1k
#endif
5996
39.1k
    encoding = le ? "utf-32-le" : "utf-32-be";
5997
5998
39.1k
    _PyUnicodeWriter_Init(&writer);
5999
39.1k
    writer.min_length = (e - q + 3) / 4;
6000
39.1k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6001
0
        goto onError;
6002
6003
101k
    while (1) {
6004
101k
        Py_UCS4 ch = 0;
6005
101k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
6006
6007
101k
        if (e - q >= 4) {
6008
79.1k
            int kind = writer.kind;
6009
79.1k
            void *data = writer.data;
6010
79.1k
            const unsigned char *last = e - 4;
6011
79.1k
            Py_ssize_t pos = writer.pos;
6012
79.1k
            if (le) {
6013
95.6k
                do {
6014
95.6k
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
6015
95.6k
                    if (ch > maxch)
6016
75.3k
                        break;
6017
20.2k
                    if (kind != PyUnicode_1BYTE_KIND &&
6018
20.2k
                        Py_UNICODE_IS_SURROGATE(ch))
6019
87
                        break;
6020
20.2k
                    PyUnicode_WRITE(kind, data, pos++, ch);
6021
20.2k
                    q += 4;
6022
20.2k
                } while (q <= last);
6023
76.3k
            }
6024
2.78k
            else {
6025
4.75k
                do {
6026
4.75k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
6027
4.75k
                    if (ch > maxch)
6028
2.61k
                        break;
6029
2.14k
                    if (kind != PyUnicode_1BYTE_KIND &&
6030
2.14k
                        Py_UNICODE_IS_SURROGATE(ch))
6031
84
                        break;
6032
2.05k
                    PyUnicode_WRITE(kind, data, pos++, ch);
6033
2.05k
                    q += 4;
6034
2.05k
                } while (q <= last);
6035
2.78k
            }
6036
0
            writer.pos = pos;
6037
79.1k
        }
6038
6039
101k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
6040
175
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
6041
175
            startinpos = ((const char *)q) - starts;
6042
175
            endinpos = startinpos + 4;
6043
175
        }
6044
101k
        else if (ch <= maxch) {
6045
23.0k
            if (q == e || consumed)
6046
2.35k
                break;
6047
            /* remaining bytes at the end? (size should be divisible by 4) */
6048
20.7k
            errmsg = "truncated data";
6049
20.7k
            startinpos = ((const char *)q) - starts;
6050
20.7k
            endinpos = ((const char *)e) - starts;
6051
20.7k
        }
6052
77.9k
        else {
6053
77.9k
            if (ch < 0x110000) {
6054
3.83k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6055
0
                    goto onError;
6056
3.83k
                q += 4;
6057
3.83k
                continue;
6058
3.83k
            }
6059
74.1k
            errmsg = "code point not in range(0x110000)";
6060
74.1k
            startinpos = ((const char *)q) - starts;
6061
74.1k
            endinpos = startinpos + 4;
6062
74.1k
        }
6063
6064
        /* The remaining input chars are ignored if the callback
6065
           chooses to skip the input */
6066
95.0k
        if (unicode_decode_call_errorhandler_writer(
6067
95.0k
                errors, &errorHandler,
6068
95.0k
                encoding, errmsg,
6069
95.0k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
6070
95.0k
                &writer))
6071
36.8k
            goto onError;
6072
95.0k
    }
6073
6074
2.35k
    if (consumed)
6075
0
        *consumed = (const char *)q-starts;
6076
6077
2.35k
    Py_XDECREF(errorHandler);
6078
2.35k
    Py_XDECREF(exc);
6079
2.35k
    return _PyUnicodeWriter_Finish(&writer);
6080
6081
36.8k
  onError:
6082
36.8k
    _PyUnicodeWriter_Dealloc(&writer);
6083
36.8k
    Py_XDECREF(errorHandler);
6084
36.8k
    Py_XDECREF(exc);
6085
36.8k
    return NULL;
6086
39.1k
}
6087
6088
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    /* Encode the str object `str` as UTF-32 and return a new bytes object.

       byteorder: <0 little-endian, >0 big-endian, 0 native order with a
           leading BOM (U+FEFF).
       errors: error-handler name used for lone surrogates; forwarded to
           unicode_encode_call_errorhandler().

       Returns NULL with an exception set if `str` is not a str object, on
       memory errors, or when the error handler fails or produces an
       unusable replacement. */
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    uint32_t *out;
#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Four bytes per character, plus four for the BOM in native mode. */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) {
        return PyErr_NoMemory();
    }
    nsize = len + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
    if (v == NULL) {
        return NULL;
    }

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
    out = (uint32_t *)PyBytes_AS_STRING(v);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        goto done;
    }

    /* Name the codec by the sign of byteorder, which is the same test that
       selects the byte order actually written (native_ordering above) and
       the same rule _PyUnicode_EncodeUTF16 uses.  This keeps error reports
       accurate for values other than -1, 0 and 1. */
    if (byteorder < 0) {
        encoding = "utf-32-le";
    }
    else if (byteorder > 0) {
        encoding = "utf-32-be";
    }
    else {
        encoding = "utf-32";
    }

    /* Latin-1 data cannot contain surrogates, so no error handling is
       needed on this path. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* The helpers return how many characters they advanced; pos stops
           short of len when they could not continue (reported below as
           "surrogates not allowed"). */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep) {
            goto error;
        }

        if (PyBytes_Check(rep)) {
            /* A bytes replacement is copied verbatim, so it must be a whole
               number of 32-bit units. */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            /* A str replacement is only accepted if it is pure ASCII. */
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Units already reserved for the characters the handler skipped
           (newpos - pos of them) are reused for the replacement. */
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0) {
                goto error;
            }
            /* The buffer may have moved; rebase the write cursor. */
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        }
        else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used.  A failed shrink is routed through the error path
       explicitly rather than relying on _PyBytes_Resize() having reset v. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    if (nsize != PyBytes_GET_SIZE(v) && _PyBytes_Resize(&v, nsize) < 0) {
        goto error;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    /* Reached directly only before any error handler has been looked up,
       so there is nothing to release here. */
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
6232
6233
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    /* Encode `unicode` as UTF-32 with the default error handling
       (errors == NULL) and byteorder 0, i.e. native byte order with a
       leading BOM. */
    const char *errors = NULL;
    const int byteorder = 0;

    return _PyUnicode_EncodeUTF32(unicode, errors, byteorder);
}
6238
6239
/* --- UTF-16 Codec ------------------------------------------------------- */
6240
6241
PyObject *
6242
PyUnicode_DecodeUTF16(const char *s,
6243
                      Py_ssize_t size,
6244
                      const char *errors,
6245
                      int *byteorder)
6246
103
{
6247
103
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6248
103
}
6249
6250
PyObject *
6251
PyUnicode_DecodeUTF16Stateful(const char *s,
6252
                              Py_ssize_t size,
6253
                              const char *errors,
6254
                              int *byteorder,
6255
                              Py_ssize_t *consumed)
6256
12.8k
{
6257
12.8k
    const char *starts = s;
6258
12.8k
    Py_ssize_t startinpos;
6259
12.8k
    Py_ssize_t endinpos;
6260
12.8k
    _PyUnicodeWriter writer;
6261
12.8k
    const unsigned char *q, *e;
6262
12.8k
    int bo = 0;       /* assume native ordering by default */
6263
12.8k
    int native_ordering;
6264
12.8k
    const char *errmsg = "";
6265
12.8k
    PyObject *errorHandler = NULL;
6266
12.8k
    PyObject *exc = NULL;
6267
12.8k
    const char *encoding;
6268
6269
12.8k
    q = (const unsigned char *)s;
6270
12.8k
    e = q + size;
6271
6272
12.8k
    if (byteorder)
6273
12.7k
        bo = *byteorder;
6274
6275
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6276
       byte order setting accordingly. In native mode, the leading BOM
6277
       mark is skipped, in all other modes, it is copied to the output
6278
       stream as-is (giving a ZWNBSP character). */
6279
12.8k
    if (bo == 0 && size >= 2) {
6280
12.1k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6281
12.1k
        if (bom == 0xFEFF) {
6282
300
            q += 2;
6283
300
            bo = -1;
6284
300
        }
6285
11.8k
        else if (bom == 0xFFFE) {
6286
1.24k
            q += 2;
6287
1.24k
            bo = 1;
6288
1.24k
        }
6289
12.1k
        if (byteorder)
6290
12.0k
            *byteorder = bo;
6291
12.1k
    }
6292
6293
12.8k
    if (q == e) {
6294
44
        if (consumed)
6295
0
            *consumed = size;
6296
44
        _Py_RETURN_UNICODE_EMPTY();
6297
44
    }
6298
6299
12.7k
#if PY_LITTLE_ENDIAN
6300
12.7k
    native_ordering = bo <= 0;
6301
12.7k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6302
#else
6303
    native_ordering = bo >= 0;
6304
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6305
#endif
6306
6307
    /* Note: size will always be longer than the resulting Unicode
6308
       character count normally.  Error handler will take care of
6309
       resizing when needed. */
6310
12.7k
    _PyUnicodeWriter_Init(&writer);
6311
12.7k
    writer.min_length = (e - q + 1) / 2;
6312
12.7k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6313
0
        goto onError;
6314
6315
46.3k
    while (1) {
6316
46.3k
        Py_UCS4 ch = 0;
6317
46.3k
        if (e - q >= 2) {
6318
38.6k
            int kind = writer.kind;
6319
38.6k
            if (kind == PyUnicode_1BYTE_KIND) {
6320
14.4k
                if (PyUnicode_IS_ASCII(writer.buffer))
6321
12.2k
                    ch = asciilib_utf16_decode(&q, e,
6322
12.2k
                            (Py_UCS1*)writer.data, &writer.pos,
6323
12.2k
                            native_ordering);
6324
2.16k
                else
6325
2.16k
                    ch = ucs1lib_utf16_decode(&q, e,
6326
2.16k
                            (Py_UCS1*)writer.data, &writer.pos,
6327
2.16k
                            native_ordering);
6328
24.1k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6329
8.75k
                ch = ucs2lib_utf16_decode(&q, e,
6330
8.75k
                        (Py_UCS2*)writer.data, &writer.pos,
6331
8.75k
                        native_ordering);
6332
15.3k
            } else {
6333
15.3k
                assert(kind == PyUnicode_4BYTE_KIND);
6334
15.3k
                ch = ucs4lib_utf16_decode(&q, e,
6335
15.3k
                        (Py_UCS4*)writer.data, &writer.pos,
6336
15.3k
                        native_ordering);
6337
15.3k
            }
6338
38.6k
        }
6339
6340
46.3k
        switch (ch)
6341
46.3k
        {
6342
12.7k
        case 0:
6343
            /* remaining byte at the end? (size should be even) */
6344
12.7k
            if (q == e || consumed)
6345
8.79k
                goto End;
6346
3.93k
            errmsg = "truncated data";
6347
3.93k
            startinpos = ((const char *)q) - starts;
6348
3.93k
            endinpos = ((const char *)e) - starts;
6349
3.93k
            break;
6350
            /* The remaining input chars are ignored if the callback
6351
               chooses to skip the input */
6352
828
        case 1:
6353
828
            q -= 2;
6354
828
            if (consumed)
6355
0
                goto End;
6356
828
            errmsg = "unexpected end of data";
6357
828
            startinpos = ((const char *)q) - starts;
6358
828
            endinpos = ((const char *)e) - starts;
6359
828
            break;
6360
11.2k
        case 2:
6361
11.2k
            errmsg = "illegal encoding";
6362
11.2k
            startinpos = ((const char *)q) - 2 - starts;
6363
11.2k
            endinpos = startinpos + 2;
6364
11.2k
            break;
6365
8.13k
        case 3:
6366
8.13k
            errmsg = "illegal UTF-16 surrogate";
6367
8.13k
            startinpos = ((const char *)q) - 4 - starts;
6368
8.13k
            endinpos = startinpos + 2;
6369
8.13k
            break;
6370
13.4k
        default:
6371
13.4k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6372
0
                goto onError;
6373
13.4k
            continue;
6374
46.3k
        }
6375
6376
24.1k
        if (unicode_decode_call_errorhandler_writer(
6377
24.1k
                errors,
6378
24.1k
                &errorHandler,
6379
24.1k
                encoding, errmsg,
6380
24.1k
                &starts,
6381
24.1k
                (const char **)&e,
6382
24.1k
                &startinpos,
6383
24.1k
                &endinpos,
6384
24.1k
                &exc,
6385
24.1k
                (const char **)&q,
6386
24.1k
                &writer))
6387
3.98k
            goto onError;
6388
24.1k
    }
6389
6390
8.79k
End:
6391
8.79k
    if (consumed)
6392
0
        *consumed = (const char *)q-starts;
6393
6394
8.79k
    Py_XDECREF(errorHandler);
6395
8.79k
    Py_XDECREF(exc);
6396
8.79k
    return _PyUnicodeWriter_Finish(&writer);
6397
6398
3.98k
  onError:
6399
3.98k
    _PyUnicodeWriter_Dealloc(&writer);
6400
3.98k
    Py_XDECREF(errorHandler);
6401
3.98k
    Py_XDECREF(exc);
6402
3.98k
    return NULL;
6403
12.7k
}
6404
6405
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    /* Encode the str object `str` as UTF-16 and return a new bytes object.

       byteorder: <0 little-endian, >0 big-endian, 0 native order with a
           leading BOM (U+FEFF).
       errors: error-handler name used for lone surrogates; forwarded to
           unicode_encode_call_errorhandler().

       Returns NULL with an exception set if `str` is not a str object, on
       memory errors, or when the error handler fails or produces an
       unusable replacement. */
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    unsigned short *out;
    Py_ssize_t pairs = 0;
#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Every character at or above U+10000 needs a surrogate pair, i.e. one
       extra 16-bit unit.  Only UCS4 strings are scanned for them. */
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *chars = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (chars[i] >= 0x10000) {
                pairs++;
            }
        }
    }

    /* Two bytes per unit, plus one unit for the BOM in native mode. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
        return PyErr_NoMemory();
    }
    nsize = len + pairs + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
    if (v == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
    out = (unsigned short *)PyBytes_AS_STRING(v);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        goto done;
    }

    /* Latin-1 data cannot contain surrogates, so no error handling is
       needed on this path. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    /* Codec name used in error reports. */
    if (byteorder < 0) {
        encoding = "utf-16-le";
    }
    else if (byteorder > 0) {
        encoding = "utf-16-be";
    }
    else {
        encoding = "utf-16";
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;
        int rep_is_bytes;

        /* The helpers return how many characters they advanced; pos stops
           short of len when they could not continue (reported below as
           "surrogates not allowed"). */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep) {
            goto error;
        }

        rep_is_bytes = PyBytes_Check(rep);
        if (rep_is_bytes) {
            /* A bytes replacement is copied verbatim, so it must be a whole
               number of 16-bit units. */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 2;
        }
        else {
            /* A str replacement is only accepted if it is pure ASCII. */
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Units already reserved for the characters the handler skipped
           (newpos - pos of them) are reused for the replacement. */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0) {
                goto error;
            }
            /* The buffer may have moved; rebase the write cursor. */
            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used.
       NOTE(review): the result of _PyBytes_Resize() is not checked; this
       presumably relies on it resetting v to NULL on failure so that NULL is
       returned below -- confirm. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    if (nsize != PyBytes_GET_SIZE(v)) {
        _PyBytes_Resize(&v, nsize);
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    /* Reached directly only before any error handler has been looked up,
       so there is nothing to release here. */
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
    /* NOTE(review): nothing in this function defines or uses STORECHAR;
       this #undef looks like a leftover -- confirm before removing. */
#undef STORECHAR
}
6568
6569
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    /* Encode `unicode` as UTF-16 with the default error handling
       (errors == NULL) and byteorder 0, i.e. native byte order with a
       leading BOM. */
    const char *errors = NULL;
    const int byteorder = 0;

    return _PyUnicode_EncodeUTF16(unicode, errors, byteorder);
}
6574
6575
_PyUnicode_Name_CAPI *
6576
_PyUnicode_GetNameCAPI(void)
6577
2.74k
{
6578
2.74k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6579
2.74k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6580
6581
2.74k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6582
2.74k
    if (ucnhash_capi == NULL) {
6583
1
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6584
1
                PyUnicodeData_CAPSULE_NAME, 1);
6585
6586
        // It's fine if we overwrite the value here. It's always the same value.
6587
1
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6588
1
    }
6589
2.74k
    return ucnhash_capi;
6590
2.74k
}
6591
6592
/* --- Unicode Escape Codec ----------------------------------------------- */
6593
6594
PyObject *
6595
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6596
                               Py_ssize_t size,
6597
                               const char *errors,
6598
                               Py_ssize_t *consumed,
6599
                               int *first_invalid_escape_char,
6600
                               const char **first_invalid_escape_ptr)
6601
31.6k
{
6602
31.6k
    const char *starts = s;
6603
31.6k
    const char *initial_starts = starts;
6604
31.6k
    _PyUnicodeWriter writer;
6605
31.6k
    const char *end;
6606
31.6k
    PyObject *errorHandler = NULL;
6607
31.6k
    PyObject *exc = NULL;
6608
31.6k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6609
6610
    // so we can remember if we've seen an invalid escape char or not
6611
31.6k
    *first_invalid_escape_char = -1;
6612
31.6k
    *first_invalid_escape_ptr = NULL;
6613
6614
31.6k
    if (size == 0) {
6615
2.28k
        if (consumed) {
6616
0
            *consumed = 0;
6617
0
        }
6618
2.28k
        _Py_RETURN_UNICODE_EMPTY();
6619
2.28k
    }
6620
    /* Escaped strings will always be longer than the resulting
6621
       Unicode string, so we start with size here and then reduce the
6622
       length after conversion to the true value.
6623
       (but if the error callback returns a long replacement string
6624
       we'll have to allocate more space) */
6625
29.3k
    _PyUnicodeWriter_Init(&writer);
6626
29.3k
    writer.min_length = size;
6627
29.3k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6628
0
        goto onError;
6629
0
    }
6630
6631
29.3k
    end = s + size;
6632
193k
    while (s < end) {
6633
164k
        unsigned char c = (unsigned char) *s++;
6634
164k
        Py_UCS4 ch;
6635
164k
        int count;
6636
164k
        const char *message;
6637
6638
164k
#define WRITE_ASCII_CHAR(ch)                                                  \
6639
164k
            do {                                                              \
6640
14.6k
                assert(ch <= 127);                                            \
6641
14.6k
                assert(writer.pos < writer.size);                             \
6642
14.6k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6643
14.6k
            } while(0)
6644
6645
164k
#define WRITE_CHAR(ch)                                                        \
6646
164k
            do {                                                              \
6647
153k
                if (ch <= writer.maxchar) {                                   \
6648
137k
                    assert(writer.pos < writer.size);                         \
6649
137k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6650
137k
                }                                                             \
6651
153k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6652
0
                    goto onError;                                             \
6653
0
                }                                                             \
6654
153k
            } while(0)
6655
6656
        /* Non-escape characters are interpreted as Unicode ordinals */
6657
164k
        if (c != '\\') {
6658
105k
            WRITE_CHAR(c);
6659
105k
            continue;
6660
105k
        }
6661
6662
58.9k
        Py_ssize_t startinpos = s - starts - 1;
6663
        /* \ - Escapes */
6664
58.9k
        if (s >= end) {
6665
0
            message = "\\ at end of string";
6666
0
            goto incomplete;
6667
0
        }
6668
58.9k
        c = (unsigned char) *s++;
6669
6670
58.9k
        assert(writer.pos < writer.size);
6671
58.9k
        switch (c) {
6672
6673
            /* \x escapes */
6674
1.08k
        case '\n': continue;
6675
1.52k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6676
946
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6677
1.16k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6678
606
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6679
        /* FF */
6680
771
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6681
840
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6682
911
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6683
1.99k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6684
        /* VT */
6685
696
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6686
        /* BEL, not classic C */
6687
664
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6688
6689
            /* \OOO (octal) escapes */
6690
3.46k
        case '0': case '1': case '2': case '3':
6691
7.46k
        case '4': case '5': case '6': case '7':
6692
7.46k
            ch = c - '0';
6693
7.46k
            if (s < end && '0' <= *s && *s <= '7') {
6694
3.58k
                ch = (ch<<3) + *s++ - '0';
6695
3.58k
                if (s < end && '0' <= *s && *s <= '7') {
6696
1.90k
                    ch = (ch<<3) + *s++ - '0';
6697
1.90k
                }
6698
3.58k
            }
6699
7.46k
            if (ch > 0377) {
6700
1.54k
                if (*first_invalid_escape_char == -1) {
6701
1.00k
                    *first_invalid_escape_char = ch;
6702
1.00k
                    if (starts == initial_starts) {
6703
                        /* Back up 3 chars, since we've already incremented s. */
6704
1.00k
                        *first_invalid_escape_ptr = s - 3;
6705
1.00k
                    }
6706
1.00k
                }
6707
1.54k
            }
6708
7.46k
            WRITE_CHAR(ch);
6709
7.46k
            continue;
6710
6711
            /* hex escapes */
6712
            /* \xXX */
6713
7.46k
        case 'x':
6714
6.01k
            count = 2;
6715
6.01k
            message = "truncated \\xXX escape";
6716
6.01k
            goto hexescape;
6717
6718
            /* \uXXXX */
6719
8.37k
        case 'u':
6720
8.37k
            count = 4;
6721
8.37k
            message = "truncated \\uXXXX escape";
6722
8.37k
            goto hexescape;
6723
6724
            /* \UXXXXXXXX */
6725
18.6k
        case 'U':
6726
18.6k
            count = 8;
6727
18.6k
            message = "truncated \\UXXXXXXXX escape";
6728
33.0k
        hexescape:
6729
227k
            for (ch = 0; count; ++s, --count) {
6730
194k
                if (s >= end) {
6731
12
                    goto incomplete;
6732
12
                }
6733
194k
                c = (unsigned char)*s;
6734
194k
                ch <<= 4;
6735
194k
                if (c >= '0' && c <= '9') {
6736
145k
                    ch += c - '0';
6737
145k
                }
6738
48.8k
                else if (c >= 'a' && c <= 'f') {
6739
48.6k
                    ch += c - ('a' - 10);
6740
48.6k
                }
6741
220
                else if (c >= 'A' && c <= 'F') {
6742
216
                    ch += c - ('A' - 10);
6743
216
                }
6744
4
                else {
6745
4
                    goto error;
6746
4
                }
6747
194k
            }
6748
6749
            /* when we get here, ch is a 32-bit unicode character */
6750
32.9k
            if (ch > MAX_UNICODE) {
6751
1
                message = "illegal Unicode character";
6752
1
                goto error;
6753
1
            }
6754
6755
32.9k
            WRITE_CHAR(ch);
6756
32.9k
            continue;
6757
6758
            /* \N{name} */
6759
32.9k
        case 'N':
6760
2.74k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6761
2.74k
            if (ucnhash_capi == NULL) {
6762
0
                PyErr_SetString(
6763
0
                        PyExc_UnicodeError,
6764
0
                        "\\N escapes not supported (can't load unicodedata module)"
6765
0
                );
6766
0
                goto onError;
6767
0
            }
6768
6769
2.74k
            message = "malformed \\N character escape";
6770
2.74k
            if (s >= end) {
6771
4
                goto incomplete;
6772
4
            }
6773
2.73k
            if (*s == '{') {
6774
2.73k
                const char *start = ++s;
6775
2.73k
                size_t namelen;
6776
                /* look for the closing brace */
6777
55.5k
                while (s < end && *s != '}')
6778
52.7k
                    s++;
6779
2.73k
                if (s >= end) {
6780
11
                    goto incomplete;
6781
11
                }
6782
2.72k
                namelen = s - start;
6783
2.72k
                if (namelen) {
6784
                    /* found a name.  look it up in the unicode database */
6785
2.72k
                    s++;
6786
2.72k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6787
2.72k
                    if (namelen <= INT_MAX &&
6788
2.72k
                        ucnhash_capi->getcode(start, (int)namelen,
6789
2.72k
                                              &ch, 0)) {
6790
2.63k
                        assert(ch <= MAX_UNICODE);
6791
2.63k
                        WRITE_CHAR(ch);
6792
2.63k
                        continue;
6793
2.63k
                    }
6794
84
                    message = "unknown Unicode character name";
6795
84
                }
6796
2.72k
            }
6797
90
            goto error;
6798
6799
4.51k
        default:
6800
4.51k
            if (*first_invalid_escape_char == -1) {
6801
3.01k
                *first_invalid_escape_char = c;
6802
3.01k
                if (starts == initial_starts) {
6803
                    /* Back up one char, since we've already incremented s. */
6804
3.01k
                    *first_invalid_escape_ptr = s - 1;
6805
3.01k
                }
6806
3.01k
            }
6807
4.51k
            WRITE_ASCII_CHAR('\\');
6808
4.51k
            WRITE_CHAR(c);
6809
4.51k
            continue;
6810
58.9k
        }
6811
6812
27
      incomplete:
6813
27
        if (consumed) {
6814
0
            *consumed = startinpos;
6815
0
            break;
6816
0
        }
6817
122
      error:;
6818
122
        Py_ssize_t endinpos = s-starts;
6819
122
        writer.min_length = end - s + writer.pos;
6820
122
        if (unicode_decode_call_errorhandler_writer(
6821
122
                errors, &errorHandler,
6822
122
                "unicodeescape", message,
6823
122
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6824
122
                &writer)) {
6825
122
            goto onError;
6826
122
        }
6827
0
        assert(end - s <= writer.size - writer.pos);
6828
6829
0
#undef WRITE_ASCII_CHAR
6830
0
#undef WRITE_CHAR
6831
0
    }
6832
6833
29.2k
    Py_XDECREF(errorHandler);
6834
29.2k
    Py_XDECREF(exc);
6835
29.2k
    return _PyUnicodeWriter_Finish(&writer);
6836
6837
122
  onError:
6838
122
    _PyUnicodeWriter_Dealloc(&writer);
6839
122
    Py_XDECREF(errorHandler);
6840
122
    Py_XDECREF(exc);
6841
122
    return NULL;
6842
29.3k
}
6843
6844
PyObject *
6845
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6846
                              Py_ssize_t size,
6847
                              const char *errors,
6848
                              Py_ssize_t *consumed)
6849
0
{
6850
0
    int first_invalid_escape_char;
6851
0
    const char *first_invalid_escape_ptr;
6852
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6853
0
                                                      consumed,
6854
0
                                                      &first_invalid_escape_char,
6855
0
                                                      &first_invalid_escape_ptr);
6856
0
    if (result == NULL)
6857
0
        return NULL;
6858
0
    if (first_invalid_escape_char != -1) {
6859
0
        if (first_invalid_escape_char > 0xff) {
6860
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6861
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6862
0
                                 "Such sequences will not work in the future. ",
6863
0
                                 first_invalid_escape_char) < 0)
6864
0
            {
6865
0
                Py_DECREF(result);
6866
0
                return NULL;
6867
0
            }
6868
0
        }
6869
0
        else {
6870
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6871
0
                                 "\"\\%c\" is an invalid escape sequence. "
6872
0
                                 "Such sequences will not work in the future. ",
6873
0
                                 first_invalid_escape_char) < 0)
6874
0
            {
6875
0
                Py_DECREF(result);
6876
0
                return NULL;
6877
0
            }
6878
0
        }
6879
0
    }
6880
0
    return result;
6881
0
}
6882
6883
PyObject *
6884
PyUnicode_DecodeUnicodeEscape(const char *s,
6885
                              Py_ssize_t size,
6886
                              const char *errors)
6887
0
{
6888
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6889
0
}
6890
6891
/* Return a Unicode-Escape string version of the Unicode object. */
6892
6893
PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
    /* Build a bytes object holding the unicode-escape form of 'unicode'.

       Returns NULL (after PyErr_BadArgument()) if 'unicode' is not a str,
       PyErr_NoMemory() if the worst-case size would overflow, and NULL if
       allocating or resizing the bytes object fails. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    if (nchars == 0) {
        return PyBytes_FromStringAndSize(NULL, 0);
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    /* The buffer is sized for the longest possible escape of every
       character: '\xHH' (4 bytes) for 1-byte strings, '\uHHHH' (6 bytes)
       for 2-byte strings and '\U00HHHHHH' (10 bytes) for 4-byte strings.
       It is trimmed to the real length once encoding is finished. */
    const Py_ssize_t worst_case = 2 + 2 * kind;
    if (nchars > PY_SSIZE_T_MAX / worst_case) {
        return PyErr_NoMemory();
    }

    PyObject *repr = PyBytes_FromStringAndSize(NULL, worst_case * nchars);
    if (repr == NULL) {
        return NULL;
    }

    char *out = PyBytes_AS_STRING(repr);
    for (Py_ssize_t idx = 0; idx < nchars; idx++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, idx);

        /* U+010000-U+10ffff: '\U00HHHHHH' */
        if (ch >= 0x10000) {
            /* The two leading hex digits are always zero. */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = '\\';
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            *out++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
            *out++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
            *out++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
            *out++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
            *out++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
            *out++ = Py_hexdigits[ch & 0x0000000F];
            continue;
        }

        /* U+0100-U+ffff: '\uHHHH' */
        if (ch >= 0x100) {
            *out++ = '\\';
            *out++ = 'u';
            *out++ = Py_hexdigits[(ch >> 12) & 0x000F];
            *out++ = Py_hexdigits[(ch >> 8) & 0x000F];
            *out++ = Py_hexdigits[(ch >> 4) & 0x000F];
            *out++ = Py_hexdigits[ch & 0x000F];
            continue;
        }

        /* U+0000-U+00ff */
        switch (ch) {
        case '\\':
            /* A backslash is doubled. */
            *out++ = '\\';
            *out++ = '\\';
            break;
        case '\t':
            *out++ = '\\';
            *out++ = 't';
            break;
        case '\n':
            *out++ = '\\';
            *out++ = 'n';
            break;
        case '\r':
            *out++ = '\\';
            *out++ = 'r';
            break;
        default:
            if (ch >= ' ' && ch < 127) {
                /* Printable US-ASCII is copied unchanged. */
                *out++ = (char) ch;
            }
            else {
                /* Remaining control and 8-bit characters: '\xHH' */
                *out++ = '\\';
                *out++ = 'x';
                *out++ = Py_hexdigits[(ch >> 4) & 0x000F];
                *out++ = Py_hexdigits[ch & 0x000F];
            }
            break;
        }
    }

    assert(out - PyBytes_AS_STRING(repr) > 0);
    if (_PyBytes_Resize(&repr, out - PyBytes_AS_STRING(repr)) < 0) {
        return NULL;
    }
    return repr;
}
7007
7008
/* --- Raw Unicode Escape Codec ------------------------------------------- */
7009
7010
PyObject *
7011
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
7012
                                          Py_ssize_t size,
7013
                                          const char *errors,
7014
                                          Py_ssize_t *consumed)
7015
0
{
7016
0
    const char *starts = s;
7017
0
    _PyUnicodeWriter writer;
7018
0
    const char *end;
7019
0
    PyObject *errorHandler = NULL;
7020
0
    PyObject *exc = NULL;
7021
7022
0
    if (size == 0) {
7023
0
        if (consumed) {
7024
0
            *consumed = 0;
7025
0
        }
7026
0
        _Py_RETURN_UNICODE_EMPTY();
7027
0
    }
7028
7029
    /* Escaped strings will always be longer than the resulting
7030
       Unicode string, so we start with size here and then reduce the
7031
       length after conversion to the true value. (But decoding error
7032
       handler might have to resize the string) */
7033
0
    _PyUnicodeWriter_Init(&writer);
7034
0
    writer.min_length = size;
7035
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
7036
0
        goto onError;
7037
0
    }
7038
7039
0
    end = s + size;
7040
0
    while (s < end) {
7041
0
        unsigned char c = (unsigned char) *s++;
7042
0
        Py_UCS4 ch;
7043
0
        int count;
7044
0
        const char *message;
7045
7046
0
#define WRITE_CHAR(ch)                                                        \
7047
0
            do {                                                              \
7048
0
                if (ch <= writer.maxchar) {                                   \
7049
0
                    assert(writer.pos < writer.size);                         \
7050
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
7051
0
                }                                                             \
7052
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
7053
0
                    goto onError;                                             \
7054
0
                }                                                             \
7055
0
            } while(0)
7056
7057
        /* Non-escape characters are interpreted as Unicode ordinals */
7058
0
        if (c != '\\' || (s >= end && !consumed)) {
7059
0
            WRITE_CHAR(c);
7060
0
            continue;
7061
0
        }
7062
7063
0
        Py_ssize_t startinpos = s - starts - 1;
7064
        /* \ - Escapes */
7065
0
        if (s >= end) {
7066
0
            assert(consumed);
7067
            // Set message to silent compiler warning.
7068
            // Actually it is never used.
7069
0
            message = "\\ at end of string";
7070
0
            goto incomplete;
7071
0
        }
7072
7073
0
        c = (unsigned char) *s++;
7074
0
        if (c == 'u') {
7075
0
            count = 4;
7076
0
            message = "truncated \\uXXXX escape";
7077
0
        }
7078
0
        else if (c == 'U') {
7079
0
            count = 8;
7080
0
            message = "truncated \\UXXXXXXXX escape";
7081
0
        }
7082
0
        else {
7083
0
            assert(writer.pos < writer.size);
7084
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
7085
0
            WRITE_CHAR(c);
7086
0
            continue;
7087
0
        }
7088
7089
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
7090
0
        for (ch = 0; count; ++s, --count) {
7091
0
            if (s >= end) {
7092
0
                goto incomplete;
7093
0
            }
7094
0
            c = (unsigned char)*s;
7095
0
            ch <<= 4;
7096
0
            if (c >= '0' && c <= '9') {
7097
0
                ch += c - '0';
7098
0
            }
7099
0
            else if (c >= 'a' && c <= 'f') {
7100
0
                ch += c - ('a' - 10);
7101
0
            }
7102
0
            else if (c >= 'A' && c <= 'F') {
7103
0
                ch += c - ('A' - 10);
7104
0
            }
7105
0
            else {
7106
0
                goto error;
7107
0
            }
7108
0
        }
7109
0
        if (ch > MAX_UNICODE) {
7110
0
            message = "\\Uxxxxxxxx out of range";
7111
0
            goto error;
7112
0
        }
7113
0
        WRITE_CHAR(ch);
7114
0
        continue;
7115
7116
0
      incomplete:
7117
0
        if (consumed) {
7118
0
            *consumed = startinpos;
7119
0
            break;
7120
0
        }
7121
0
      error:;
7122
0
        Py_ssize_t endinpos = s-starts;
7123
0
        writer.min_length = end - s + writer.pos;
7124
0
        if (unicode_decode_call_errorhandler_writer(
7125
0
                errors, &errorHandler,
7126
0
                "rawunicodeescape", message,
7127
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
7128
0
                &writer)) {
7129
0
            goto onError;
7130
0
        }
7131
0
        assert(end - s <= writer.size - writer.pos);
7132
7133
0
#undef WRITE_CHAR
7134
0
    }
7135
0
    Py_XDECREF(errorHandler);
7136
0
    Py_XDECREF(exc);
7137
0
    return _PyUnicodeWriter_Finish(&writer);
7138
7139
0
  onError:
7140
0
    _PyUnicodeWriter_Dealloc(&writer);
7141
0
    Py_XDECREF(errorHandler);
7142
0
    Py_XDECREF(exc);
7143
0
    return NULL;
7144
0
}
7145
7146
PyObject *
7147
PyUnicode_DecodeRawUnicodeEscape(const char *s,
7148
                                 Py_ssize_t size,
7149
                                 const char *errors)
7150
0
{
7151
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
7152
0
}
7153
7154
7155
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    /* Build a bytes object holding the raw-unicode-escape form of
       'unicode': characters below U+0100 are copied as single bytes, the
       rest become '\uHHHH' or '\U00HHHHHH'.

       Returns NULL (after PyErr_BadArgument()) if 'unicode' is not a str,
       PyErr_NoMemory() if the worst-case size would overflow, and NULL if
       allocating or resizing the bytes object fails. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);

    /* 1-byte strings need no escaping: their buffer is the result. */
    if (kind == PyUnicode_1BYTE_KIND) {
        return PyBytes_FromStringAndSize(data, nchars);
    }

    /* Worst-case output per character: 10 bytes for 4-byte strings and
       6 bytes for 2-byte strings (the formula would give 4 for 1-byte
       strings, which were handled above). */
    const Py_ssize_t worst_case = 2 + 2 * kind;
    if (nchars > PY_SSIZE_T_MAX / worst_case) {
        return PyErr_NoMemory();
    }

    PyObject *repr = PyBytes_FromStringAndSize(NULL, worst_case * nchars);
    if (repr == NULL) {
        return NULL;
    }
    if (nchars == 0) {
        return repr;
    }

    char *out = PyBytes_AS_STRING(repr);
    for (Py_ssize_t idx = 0; idx < nchars; idx++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, idx);

        if (ch < 0x100) {
            /* U+0000-U+00ff: one byte, unchanged */
            *out++ = (char) ch;
            continue;
        }

        *out++ = '\\';
        if (ch < 0x10000) {
            /* U+0100-U+ffff: '\uHHHH' */
            *out++ = 'u';
        }
        else {
            /* U+010000-U+10ffff: '\U00HHHHHH' */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            *out++ = Py_hexdigits[(ch >> 20) & 0xf];
            *out++ = Py_hexdigits[(ch >> 16) & 0xf];
        }
        /* Both escape forms end with the same four low hex digits. */
        *out++ = Py_hexdigits[(ch >> 12) & 0xf];
        *out++ = Py_hexdigits[(ch >> 8) & 0xf];
        *out++ = Py_hexdigits[(ch >> 4) & 0xf];
        *out++ = Py_hexdigits[ch & 15];
    }

    assert(out > PyBytes_AS_STRING(repr));
    if (_PyBytes_Resize(&repr, out - PyBytes_AS_STRING(repr)) < 0) {
        return NULL;
    }
    return repr;
}
7230
7231
/* --- Latin-1 Codec ------------------------------------------------------ */
7232
7233
PyObject *
7234
PyUnicode_DecodeLatin1(const char *s,
7235
                       Py_ssize_t size,
7236
                       const char *errors)
7237
1.81M
{
7238
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7239
1.81M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7240
1.81M
}
7241
7242
/* create or adjust a UnicodeEncodeError */
7243
static void
7244
make_encode_exception(PyObject **exceptionObject,
7245
                      const char *encoding,
7246
                      PyObject *unicode,
7247
                      Py_ssize_t startpos, Py_ssize_t endpos,
7248
                      const char *reason)
7249
187k
{
7250
187k
    if (*exceptionObject == NULL) {
7251
187k
        *exceptionObject = PyObject_CallFunction(
7252
187k
            PyExc_UnicodeEncodeError, "sOnns",
7253
187k
            encoding, unicode, startpos, endpos, reason);
7254
187k
    }
7255
0
    else {
7256
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7257
0
            goto onError;
7258
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7259
0
            goto onError;
7260
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7261
0
            goto onError;
7262
0
        return;
7263
0
      onError:
7264
0
        Py_CLEAR(*exceptionObject);
7265
0
    }
7266
187k
}
7267
7268
/* raises a UnicodeEncodeError */
7269
static void
7270
raise_encode_exception(PyObject **exceptionObject,
7271
                       const char *encoding,
7272
                       PyObject *unicode,
7273
                       Py_ssize_t startpos, Py_ssize_t endpos,
7274
                       const char *reason)
7275
34.6k
{
7276
34.6k
    make_encode_exception(exceptionObject,
7277
34.6k
                          encoding, unicode, startpos, endpos, reason);
7278
34.6k
    if (*exceptionObject != NULL)
7279
34.6k
        PyCodec_StrictErrors(*exceptionObject);
7280
34.6k
}
7281
7282
/* error handling callback helper:
7283
   build arguments, call the callback and check the arguments,
7284
   put the result into newpos and return the replacement string, which
7285
   has to be freed by the caller */
7286
static PyObject *
7287
unicode_encode_call_errorhandler(const char *errors,
7288
                                 PyObject **errorHandler,
7289
                                 const char *encoding, const char *reason,
7290
                                 PyObject *unicode, PyObject **exceptionObject,
7291
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7292
                                 Py_ssize_t *newpos)
7293
152k
{
7294
152k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7295
152k
    Py_ssize_t len;
7296
152k
    PyObject *restuple;
7297
152k
    PyObject *resunicode;
7298
7299
152k
    if (*errorHandler == NULL) {
7300
152k
        *errorHandler = PyCodec_LookupError(errors);
7301
152k
        if (*errorHandler == NULL)
7302
0
            return NULL;
7303
152k
    }
7304
7305
152k
    len = PyUnicode_GET_LENGTH(unicode);
7306
7307
152k
    make_encode_exception(exceptionObject,
7308
152k
                          encoding, unicode, startpos, endpos, reason);
7309
152k
    if (*exceptionObject == NULL)
7310
0
        return NULL;
7311
7312
152k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7313
152k
    if (restuple == NULL)
7314
152k
        return NULL;
7315
0
    if (!PyTuple_Check(restuple)) {
7316
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7317
0
        Py_DECREF(restuple);
7318
0
        return NULL;
7319
0
    }
7320
0
    if (!PyArg_ParseTuple(restuple, argparse,
7321
0
                          &resunicode, newpos)) {
7322
0
        Py_DECREF(restuple);
7323
0
        return NULL;
7324
0
    }
7325
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7326
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7327
0
        Py_DECREF(restuple);
7328
0
        return NULL;
7329
0
    }
7330
0
    if (*newpos<0)
7331
0
        *newpos = len + *newpos;
7332
0
    if (*newpos<0 || *newpos>len) {
7333
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7334
0
        Py_DECREF(restuple);
7335
0
        return NULL;
7336
0
    }
7337
0
    Py_INCREF(resunicode);
7338
0
    Py_DECREF(restuple);
7339
0
    return resunicode;
7340
0
}
7341
7342
static PyObject *
7343
unicode_encode_ucs1(PyObject *unicode,
7344
                    const char *errors,
7345
                    const Py_UCS4 limit)
7346
44.0k
{
7347
    /* input state */
7348
44.0k
    Py_ssize_t pos=0, size;
7349
44.0k
    int kind;
7350
44.0k
    const void *data;
7351
    /* pointer into the output */
7352
44.0k
    char *str;
7353
44.0k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7354
44.0k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7355
44.0k
    PyObject *error_handler_obj = NULL;
7356
44.0k
    PyObject *exc = NULL;
7357
44.0k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7358
44.0k
    PyObject *rep = NULL;
7359
    /* output object */
7360
44.0k
    _PyBytesWriter writer;
7361
7362
44.0k
    size = PyUnicode_GET_LENGTH(unicode);
7363
44.0k
    kind = PyUnicode_KIND(unicode);
7364
44.0k
    data = PyUnicode_DATA(unicode);
7365
    /* allocate enough for a simple encoding without
7366
       replacements, if we need more, we'll resize */
7367
44.0k
    if (size == 0)
7368
0
        return PyBytes_FromStringAndSize(NULL, 0);
7369
7370
44.0k
    _PyBytesWriter_Init(&writer);
7371
44.0k
    str = _PyBytesWriter_Alloc(&writer, size);
7372
44.0k
    if (str == NULL)
7373
0
        return NULL;
7374
7375
2.54M
    while (pos < size) {
7376
2.54M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7377
7378
        /* can we encode this? */
7379
2.54M
        if (ch < limit) {
7380
            /* no overflow check, because we know that the space is enough */
7381
2.50M
            *str++ = (char)ch;
7382
2.50M
            ++pos;
7383
2.50M
        }
7384
44.0k
        else {
7385
44.0k
            Py_ssize_t newpos, i;
7386
            /* startpos for collecting unencodable chars */
7387
44.0k
            Py_ssize_t collstart = pos;
7388
44.0k
            Py_ssize_t collend = collstart + 1;
7389
            /* find all unecodable characters */
7390
7391
300k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7392
256k
                ++collend;
7393
7394
            /* Only overallocate the buffer if it's not the last write */
7395
44.0k
            writer.overallocate = (collend < size);
7396
7397
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7398
44.0k
            if (error_handler == _Py_ERROR_UNKNOWN)
7399
44.0k
                error_handler = _Py_GetErrorHandler(errors);
7400
7401
44.0k
            switch (error_handler) {
7402
34.6k
            case _Py_ERROR_STRICT:
7403
34.6k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7404
34.6k
                goto onError;
7405
7406
0
            case _Py_ERROR_REPLACE:
7407
0
                memset(str, '?', collend - collstart);
7408
0
                str += (collend - collstart);
7409
0
                _Py_FALLTHROUGH;
7410
0
            case _Py_ERROR_IGNORE:
7411
0
                pos = collend;
7412
0
                break;
7413
7414
0
            case _Py_ERROR_BACKSLASHREPLACE:
7415
                /* subtract preallocated bytes */
7416
0
                writer.min_size -= (collend - collstart);
7417
0
                str = backslashreplace(&writer, str,
7418
0
                                       unicode, collstart, collend);
7419
0
                if (str == NULL)
7420
0
                    goto onError;
7421
0
                pos = collend;
7422
0
                break;
7423
7424
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7425
                /* subtract preallocated bytes */
7426
0
                writer.min_size -= (collend - collstart);
7427
0
                str = xmlcharrefreplace(&writer, str,
7428
0
                                        unicode, collstart, collend);
7429
0
                if (str == NULL)
7430
0
                    goto onError;
7431
0
                pos = collend;
7432
0
                break;
7433
7434
9.36k
            case _Py_ERROR_SURROGATEESCAPE:
7435
9.36k
                for (i = collstart; i < collend; ++i) {
7436
9.36k
                    ch = PyUnicode_READ(kind, data, i);
7437
9.36k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7438
                        /* Not a UTF-8b surrogate */
7439
9.36k
                        break;
7440
9.36k
                    }
7441
0
                    *str++ = (char)(ch - 0xdc00);
7442
0
                    ++pos;
7443
0
                }
7444
9.36k
                if (i >= collend)
7445
0
                    break;
7446
9.36k
                collstart = pos;
7447
9.36k
                assert(collstart != collend);
7448
9.36k
                _Py_FALLTHROUGH;
7449
7450
9.36k
            default:
7451
9.36k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7452
9.36k
                                                       encoding, reason, unicode, &exc,
7453
9.36k
                                                       collstart, collend, &newpos);
7454
9.36k
                if (rep == NULL)
7455
9.36k
                    goto onError;
7456
7457
0
                if (newpos < collstart) {
7458
0
                    writer.overallocate = 1;
7459
0
                    str = _PyBytesWriter_Prepare(&writer, str,
7460
0
                                                 collstart - newpos);
7461
0
                    if (str == NULL)
7462
0
                        goto onError;
7463
0
                }
7464
0
                else {
7465
                    /* subtract preallocated bytes */
7466
0
                    writer.min_size -= newpos - collstart;
7467
                    /* Only overallocate the buffer if it's not the last write */
7468
0
                    writer.overallocate = (newpos < size);
7469
0
                }
7470
7471
0
                if (PyBytes_Check(rep)) {
7472
                    /* Directly copy bytes result to output. */
7473
0
                    str = _PyBytesWriter_WriteBytes(&writer, str,
7474
0
                                                    PyBytes_AS_STRING(rep),
7475
0
                                                    PyBytes_GET_SIZE(rep));
7476
0
                }
7477
0
                else {
7478
0
                    assert(PyUnicode_Check(rep));
7479
7480
0
                    if (limit == 256 ?
7481
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7482
0
                        !PyUnicode_IS_ASCII(rep))
7483
0
                    {
7484
                        /* Not all characters are smaller than limit */
7485
0
                        raise_encode_exception(&exc, encoding, unicode,
7486
0
                                               collstart, collend, reason);
7487
0
                        goto onError;
7488
0
                    }
7489
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7490
0
                    str = _PyBytesWriter_WriteBytes(&writer, str,
7491
0
                                                    PyUnicode_DATA(rep),
7492
0
                                                    PyUnicode_GET_LENGTH(rep));
7493
0
                }
7494
0
                if (str == NULL)
7495
0
                    goto onError;
7496
7497
0
                pos = newpos;
7498
0
                Py_CLEAR(rep);
7499
44.0k
            }
7500
7501
            /* If overallocation was disabled, ensure that it was the last
7502
               write. Otherwise, we missed an optimization */
7503
0
            assert(writer.overallocate || pos == size);
7504
0
        }
7505
2.54M
    }
7506
7507
0
    Py_XDECREF(error_handler_obj);
7508
0
    Py_XDECREF(exc);
7509
0
    return _PyBytesWriter_Finish(&writer, str);
7510
7511
44.0k
  onError:
7512
44.0k
    Py_XDECREF(rep);
7513
44.0k
    _PyBytesWriter_Dealloc(&writer);
7514
44.0k
    Py_XDECREF(error_handler_obj);
7515
44.0k
    Py_XDECREF(exc);
7516
44.0k
    return NULL;
7517
44.0k
}
7518
7519
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    /* Encode 'unicode' as Latin-1 using the error handler 'errors'.
       Returns NULL (after PyErr_BadArgument()) if 'unicode' is not a str. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Not a 1-byte string: the generic encoder takes over and applies
           the error handler. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* 1-byte string: build the bytes object straight from its buffer. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7535
7536
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    /* Latin-1 encoding with a NULL 'errors' argument. */
    const char *errors = NULL;

    return _PyUnicode_AsLatin1String(unicode, errors);
}
7541
7542
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7543
7544
PyObject *
7545
PyUnicode_DecodeASCII(const char *s,
7546
                      Py_ssize_t size,
7547
                      const char *errors)
7548
750k
{
7549
750k
    const char *starts = s;
7550
750k
    const char *e = s + size;
7551
750k
    PyObject *error_handler_obj = NULL;
7552
750k
    PyObject *exc = NULL;
7553
750k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7554
7555
750k
    if (size == 0)
7556
0
        _Py_RETURN_UNICODE_EMPTY();
7557
7558
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7559
750k
    if (size == 1 && (unsigned char)s[0] < 128) {
7560
5.73k
        return get_latin1_char((unsigned char)s[0]);
7561
5.73k
    }
7562
7563
    // Shortcut for simple case
7564
745k
    PyObject *u = PyUnicode_New(size, 127);
7565
745k
    if (u == NULL) {
7566
0
        return NULL;
7567
0
    }
7568
745k
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7569
745k
    if (outpos == size) {
7570
605k
        return u;
7571
605k
    }
7572
7573
139k
    _PyUnicodeWriter writer;
7574
139k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7575
139k
    writer.pos = outpos;
7576
7577
139k
    s += outpos;
7578
139k
    int kind = writer.kind;
7579
139k
    void *data = writer.data;
7580
139k
    Py_ssize_t startinpos, endinpos;
7581
7582
17.5M
    while (s < e) {
7583
17.4M
        unsigned char c = (unsigned char)*s;
7584
17.4M
        if (c < 128) {
7585
6.94M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7586
6.94M
            writer.pos++;
7587
6.94M
            ++s;
7588
6.94M
            continue;
7589
6.94M
        }
7590
7591
        /* byte outsize range 0x00..0x7f: call the error handler */
7592
7593
10.4M
        if (error_handler == _Py_ERROR_UNKNOWN)
7594
139k
            error_handler = _Py_GetErrorHandler(errors);
7595
7596
10.4M
        switch (error_handler)
7597
10.4M
        {
7598
684k
        case _Py_ERROR_REPLACE:
7599
10.4M
        case _Py_ERROR_SURROGATEESCAPE:
7600
            /* Fast-path: the error handler only writes one character,
7601
               but we may switch to UCS2 at the first write */
7602
10.4M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7603
0
                goto onError;
7604
10.4M
            kind = writer.kind;
7605
10.4M
            data = writer.data;
7606
7607
10.4M
            if (error_handler == _Py_ERROR_REPLACE)
7608
684k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7609
9.78M
            else
7610
9.78M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7611
10.4M
            writer.pos++;
7612
10.4M
            ++s;
7613
10.4M
            break;
7614
7615
0
        case _Py_ERROR_IGNORE:
7616
0
            ++s;
7617
0
            break;
7618
7619
7.36k
        default:
7620
7.36k
            startinpos = s-starts;
7621
7.36k
            endinpos = startinpos + 1;
7622
7.36k
            if (unicode_decode_call_errorhandler_writer(
7623
7.36k
                    errors, &error_handler_obj,
7624
7.36k
                    "ascii", "ordinal not in range(128)",
7625
7.36k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7626
7.36k
                    &writer))
7627
7.36k
                goto onError;
7628
0
            kind = writer.kind;
7629
0
            data = writer.data;
7630
10.4M
        }
7631
10.4M
    }
7632
132k
    Py_XDECREF(error_handler_obj);
7633
132k
    Py_XDECREF(exc);
7634
132k
    return _PyUnicodeWriter_Finish(&writer);
7635
7636
7.36k
  onError:
7637
7.36k
    _PyUnicodeWriter_Dealloc(&writer);
7638
7.36k
    Py_XDECREF(error_handler_obj);
7639
7.36k
    Py_XDECREF(exc);
7640
7.36k
    return NULL;
7641
139k
}
7642
7643
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    /* Encode 'unicode' as ASCII using the error handler 'errors'.
       Returns NULL (after PyErr_BadArgument()) if 'unicode' is not a str. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters present: the generic encoder takes over
           and applies the error handler. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* ASCII-only string: build the bytes object straight from its
       buffer. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7657
7658
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    /* ASCII encoding with a NULL 'errors' argument. */
    const char *errors = NULL;

    return _PyUnicode_AsASCIIString(unicode, errors);
}
7663
7664
#ifdef MS_WINDOWS

/* --- MBCS codecs for Windows -------------------------------------------- */

/* The code page helpers below take their chunk lengths as int.  When int
   is narrower than size_t one chunk may not cover the whole input, so the
   codecs have to loop over several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#  define NEED_RETRY
#endif

/* Size of one chunk.  INT_MAX is the theoretical maximum (INT_MAX / 2
   when transcoding from UTF-16), but INT_MAX / 4 performs better in both
   cases and also keeps partial characters from overrunning the length
   limit of MultiByteToWideChar on Windows. */
#define DECODING_CHUNK_SIZE (INT_MAX / 4)

/* Fallback for headers that do not provide this flag */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7681
7682
static const char*
7683
code_page_name(UINT code_page, PyObject **obj)
7684
{
7685
    *obj = NULL;
7686
    if (code_page == CP_ACP)
7687
        return "mbcs";
7688
    if (code_page == CP_UTF7)
7689
        return "CP_UTF7";
7690
    if (code_page == CP_UTF8)
7691
        return "CP_UTF8";
7692
7693
    *obj = PyBytes_FromFormat("cp%u", code_page);
7694
    if (*obj == NULL)
7695
        return NULL;
7696
    return PyBytes_AS_STRING(*obj);
7697
}
7698
7699
static DWORD
7700
decode_code_page_flags(UINT code_page)
7701
{
7702
    if (code_page == CP_UTF7) {
7703
        /* The CP_UTF7 decoder only supports flags=0 */
7704
        return 0;
7705
    }
7706
    else
7707
        return MB_ERR_INVALID_CHARS;
7708
}
7709
7710
/*
7711
 * Decode a byte string from a Windows code page into unicode object in strict
7712
 * mode.
7713
 *
7714
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7715
 * OSError and returns -1 on other error.
7716
 */
7717
static int
7718
decode_code_page_strict(UINT code_page,
7719
                        wchar_t **buf,
7720
                        Py_ssize_t *bufsize,
7721
                        const char *in,
7722
                        int insize)
7723
{
7724
    DWORD flags = MB_ERR_INVALID_CHARS;
7725
    wchar_t *out;
7726
    DWORD outsize;
7727
7728
    /* First get the size of the result */
7729
    assert(insize > 0);
7730
    while ((outsize = MultiByteToWideChar(code_page, flags,
7731
                                          in, insize, NULL, 0)) <= 0)
7732
    {
7733
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7734
            goto error;
7735
        }
7736
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7737
        flags = 0;
7738
    }
7739
7740
    /* Extend a wchar_t* buffer */
7741
    Py_ssize_t n = *bufsize;   /* Get the current length */
7742
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7743
        return -1;
7744
    }
7745
    out = *buf + n;
7746
7747
    /* Do the conversion */
7748
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7749
    if (outsize <= 0)
7750
        goto error;
7751
    return insize;
7752
7753
error:
7754
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7755
        return -2;
7756
    PyErr_SetFromWindowsErr(0);
7757
    return -1;
7758
}
7759
7760
/*
7761
 * Decode a byte string from a code page into unicode object with an error
7762
 * handler.
7763
 *
7764
 * Returns consumed size if succeed, or raise an OSError or
7765
 * UnicodeDecodeError exception and returns -1 on error.
7766
 */
7767
static int
7768
decode_code_page_errors(UINT code_page,
7769
                        wchar_t **buf,
7770
                        Py_ssize_t *bufsize,
7771
                        const char *in, const int size,
7772
                        const char *errors, int final)
7773
{
7774
    const char *startin = in;
7775
    const char *endin = in + size;
7776
    DWORD flags = MB_ERR_INVALID_CHARS;
7777
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7778
       2000 English version of the message. */
7779
    const char *reason = "No mapping for the Unicode character exists "
7780
                         "in the target code page.";
7781
    /* each step cannot decode more than 1 character, but a character can be
7782
       represented as a surrogate pair */
7783
    wchar_t buffer[2], *out;
7784
    int insize;
7785
    Py_ssize_t outsize;
7786
    PyObject *errorHandler = NULL;
7787
    PyObject *exc = NULL;
7788
    PyObject *encoding_obj = NULL;
7789
    const char *encoding;
7790
    DWORD err;
7791
    int ret = -1;
7792
7793
    assert(size > 0);
7794
7795
    encoding = code_page_name(code_page, &encoding_obj);
7796
    if (encoding == NULL)
7797
        return -1;
7798
7799
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7800
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7801
           UnicodeDecodeError. */
7802
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7803
        if (exc != NULL) {
7804
            PyCodec_StrictErrors(exc);
7805
            Py_CLEAR(exc);
7806
        }
7807
        goto error;
7808
    }
7809
7810
    /* Extend a wchar_t* buffer */
7811
    Py_ssize_t n = *bufsize;   /* Get the current length */
7812
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7813
        PyErr_NoMemory();
7814
        goto error;
7815
    }
7816
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7817
        goto error;
7818
    }
7819
    out = *buf + n;
7820
7821
    /* Decode the byte string character per character */
7822
    while (in < endin)
7823
    {
7824
        /* Decode a character */
7825
        insize = 1;
7826
        do
7827
        {
7828
            outsize = MultiByteToWideChar(code_page, flags,
7829
                                          in, insize,
7830
                                          buffer, Py_ARRAY_LENGTH(buffer));
7831
            if (outsize > 0)
7832
                break;
7833
            err = GetLastError();
7834
            if (err == ERROR_INVALID_FLAGS && flags) {
7835
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7836
                flags = 0;
7837
                continue;
7838
            }
7839
            if (err != ERROR_NO_UNICODE_TRANSLATION
7840
                && err != ERROR_INSUFFICIENT_BUFFER)
7841
            {
7842
                PyErr_SetFromWindowsErr(err);
7843
                goto error;
7844
            }
7845
            insize++;
7846
        }
7847
        /* 4=maximum length of a UTF-8 sequence */
7848
        while (insize <= 4 && (in + insize) <= endin);
7849
7850
        if (outsize <= 0) {
7851
            Py_ssize_t startinpos, endinpos, outpos;
7852
7853
            /* last character in partial decode? */
7854
            if (in + insize >= endin && !final)
7855
                break;
7856
7857
            startinpos = in - startin;
7858
            endinpos = startinpos + 1;
7859
            outpos = out - *buf;
7860
            if (unicode_decode_call_errorhandler_wchar(
7861
                    errors, &errorHandler,
7862
                    encoding, reason,
7863
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7864
                    buf, bufsize, &outpos))
7865
            {
7866
                goto error;
7867
            }
7868
            out = *buf + outpos;
7869
        }
7870
        else {
7871
            in += insize;
7872
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7873
            out += outsize;
7874
        }
7875
    }
7876
7877
    /* Shrink the buffer */
7878
    assert(out - *buf <= *bufsize);
7879
    *bufsize = out - *buf;
7880
    /* (in - startin) <= size and size is an int */
7881
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7882
7883
error:
7884
    Py_XDECREF(encoding_obj);
7885
    Py_XDECREF(errorHandler);
7886
    Py_XDECREF(exc);
7887
    return ret;
7888
}
7889
7890
static PyObject *
7891
decode_code_page_stateful(int code_page,
7892
                          const char *s, Py_ssize_t size,
7893
                          const char *errors, Py_ssize_t *consumed)
7894
{
7895
    wchar_t *buf = NULL;
7896
    Py_ssize_t bufsize = 0;
7897
    int chunk_size, final, converted, done;
7898
7899
    if (code_page < 0) {
7900
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7901
        return NULL;
7902
    }
7903
    if (size < 0) {
7904
        PyErr_BadInternalCall();
7905
        return NULL;
7906
    }
7907
7908
    if (consumed)
7909
        *consumed = 0;
7910
7911
    do
7912
    {
7913
#ifdef NEED_RETRY
7914
        if (size > DECODING_CHUNK_SIZE) {
7915
            chunk_size = DECODING_CHUNK_SIZE;
7916
            final = 0;
7917
            done = 0;
7918
        }
7919
        else
7920
#endif
7921
        {
7922
            chunk_size = (int)size;
7923
            final = (consumed == NULL);
7924
            done = 1;
7925
        }
7926
7927
        if (chunk_size == 0 && done) {
7928
            if (buf != NULL)
7929
                break;
7930
            _Py_RETURN_UNICODE_EMPTY();
7931
        }
7932
7933
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7934
                                            s, chunk_size);
7935
        if (converted == -2)
7936
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7937
                                                s, chunk_size,
7938
                                                errors, final);
7939
        assert(converted != 0 || done);
7940
7941
        if (converted < 0) {
7942
            PyMem_Free(buf);
7943
            return NULL;
7944
        }
7945
7946
        if (consumed)
7947
            *consumed += converted;
7948
7949
        s += converted;
7950
        size -= converted;
7951
    } while (!done);
7952
7953
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7954
    PyMem_Free(buf);
7955
    return v;
7956
}
7957
7958
PyObject *
7959
PyUnicode_DecodeCodePageStateful(int code_page,
7960
                                 const char *s,
7961
                                 Py_ssize_t size,
7962
                                 const char *errors,
7963
                                 Py_ssize_t *consumed)
7964
{
7965
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7966
}
7967
7968
PyObject *
7969
PyUnicode_DecodeMBCSStateful(const char *s,
7970
                             Py_ssize_t size,
7971
                             const char *errors,
7972
                             Py_ssize_t *consumed)
7973
{
7974
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7975
}
7976
7977
PyObject *
7978
PyUnicode_DecodeMBCS(const char *s,
7979
                     Py_ssize_t size,
7980
                     const char *errors)
7981
{
7982
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7983
}
7984
7985
static DWORD
7986
encode_code_page_flags(UINT code_page, const char *errors)
7987
{
7988
    if (code_page == CP_UTF8) {
7989
        return WC_ERR_INVALID_CHARS;
7990
    }
7991
    else if (code_page == CP_UTF7) {
7992
        /* CP_UTF7 only supports flags=0 */
7993
        return 0;
7994
    }
7995
    else {
7996
        if (errors != NULL && strcmp(errors, "replace") == 0)
7997
            return 0;
7998
        else
7999
            return WC_NO_BEST_FIT_CHARS;
8000
    }
8001
}
8002
8003
/*
8004
 * Encode a Unicode string to a Windows code page into a byte string in strict
8005
 * mode.
8006
 *
8007
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
8008
 * an OSError and returns -1 on other error.
8009
 */
8010
static int
8011
encode_code_page_strict(UINT code_page, PyObject **outbytes,
8012
                        PyObject *unicode, Py_ssize_t offset, int len,
8013
                        const char* errors)
8014
{
8015
    BOOL usedDefaultChar = FALSE;
8016
    BOOL *pusedDefaultChar = &usedDefaultChar;
8017
    int outsize;
8018
    wchar_t *p;
8019
    Py_ssize_t size;
8020
    const DWORD flags = encode_code_page_flags(code_page, NULL);
8021
    char *out;
8022
    /* Create a substring so that we can get the UTF-16 representation
8023
       of just the slice under consideration. */
8024
    PyObject *substring;
8025
    int ret = -1;
8026
8027
    assert(len > 0);
8028
8029
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
8030
        pusedDefaultChar = &usedDefaultChar;
8031
    else
8032
        pusedDefaultChar = NULL;
8033
8034
    substring = PyUnicode_Substring(unicode, offset, offset+len);
8035
    if (substring == NULL)
8036
        return -1;
8037
    p = PyUnicode_AsWideCharString(substring, &size);
8038
    Py_CLEAR(substring);
8039
    if (p == NULL) {
8040
        return -1;
8041
    }
8042
    assert(size <= INT_MAX);
8043
8044
    /* First get the size of the result */
8045
    outsize = WideCharToMultiByte(code_page, flags,
8046
                                  p, (int)size,
8047
                                  NULL, 0,
8048
                                  NULL, pusedDefaultChar);
8049
    if (outsize <= 0)
8050
        goto error;
8051
    /* If we used a default char, then we failed! */
8052
    if (pusedDefaultChar && *pusedDefaultChar) {
8053
        ret = -2;
8054
        goto done;
8055
    }
8056
8057
    if (*outbytes == NULL) {
8058
        /* Create string object */
8059
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
8060
        if (*outbytes == NULL) {
8061
            goto done;
8062
        }
8063
        out = PyBytes_AS_STRING(*outbytes);
8064
    }
8065
    else {
8066
        /* Extend string object */
8067
        const Py_ssize_t n = PyBytes_Size(*outbytes);
8068
        if (outsize > PY_SSIZE_T_MAX - n) {
8069
            PyErr_NoMemory();
8070
            goto done;
8071
        }
8072
        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
8073
            goto done;
8074
        }
8075
        out = PyBytes_AS_STRING(*outbytes) + n;
8076
    }
8077
8078
    /* Do the conversion */
8079
    outsize = WideCharToMultiByte(code_page, flags,
8080
                                  p, (int)size,
8081
                                  out, outsize,
8082
                                  NULL, pusedDefaultChar);
8083
    if (outsize <= 0)
8084
        goto error;
8085
    if (pusedDefaultChar && *pusedDefaultChar) {
8086
        ret = -2;
8087
        goto done;
8088
    }
8089
    ret = 0;
8090
8091
done:
8092
    PyMem_Free(p);
8093
    return ret;
8094
8095
error:
8096
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
8097
        ret = -2;
8098
        goto done;
8099
    }
8100
    PyErr_SetFromWindowsErr(0);
8101
    goto done;
8102
}
8103
8104
/*
8105
 * Encode a Unicode string to a Windows code page into a byte string using an
8106
 * error handler.
8107
 *
8108
 * Returns consumed characters if succeed, or raise an OSError and returns
8109
 * -1 on other error.
8110
 */
8111
static int
8112
encode_code_page_errors(UINT code_page, PyObject **outbytes,
8113
                        PyObject *unicode, Py_ssize_t unicode_offset,
8114
                        Py_ssize_t insize, const char* errors)
8115
{
8116
    const DWORD flags = encode_code_page_flags(code_page, errors);
8117
    Py_ssize_t pos = unicode_offset;
8118
    Py_ssize_t endin = unicode_offset + insize;
8119
    /* Ideally, we should get reason from FormatMessage. This is the Windows
8120
       2000 English version of the message. */
8121
    const char *reason = "invalid character";
8122
    /* 4=maximum length of a UTF-8 sequence */
8123
    char buffer[4];
8124
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
8125
    Py_ssize_t outsize;
8126
    char *out;
8127
    PyObject *errorHandler = NULL;
8128
    PyObject *exc = NULL;
8129
    PyObject *encoding_obj = NULL;
8130
    const char *encoding;
8131
    Py_ssize_t newpos, newoutsize;
8132
    PyObject *rep;
8133
    int ret = -1;
8134
8135
    assert(insize > 0);
8136
8137
    encoding = code_page_name(code_page, &encoding_obj);
8138
    if (encoding == NULL)
8139
        return -1;
8140
8141
    if (errors == NULL || strcmp(errors, "strict") == 0) {
8142
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
8143
           then we raise a UnicodeEncodeError. */
8144
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
8145
        if (exc != NULL) {
8146
            PyCodec_StrictErrors(exc);
8147
            Py_DECREF(exc);
8148
        }
8149
        Py_XDECREF(encoding_obj);
8150
        return -1;
8151
    }
8152
8153
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
8154
        pusedDefaultChar = &usedDefaultChar;
8155
    else
8156
        pusedDefaultChar = NULL;
8157
8158
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
8159
        PyErr_NoMemory();
8160
        goto error;
8161
    }
8162
    outsize = insize * Py_ARRAY_LENGTH(buffer);
8163
8164
    if (*outbytes == NULL) {
8165
        /* Create string object */
8166
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
8167
        if (*outbytes == NULL)
8168
            goto error;
8169
        out = PyBytes_AS_STRING(*outbytes);
8170
    }
8171
    else {
8172
        /* Extend string object */
8173
        Py_ssize_t n = PyBytes_Size(*outbytes);
8174
        if (n > PY_SSIZE_T_MAX - outsize) {
8175
            PyErr_NoMemory();
8176
            goto error;
8177
        }
8178
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
8179
            goto error;
8180
        out = PyBytes_AS_STRING(*outbytes) + n;
8181
    }
8182
8183
    /* Encode the string character per character */
8184
    while (pos < endin)
8185
    {
8186
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8187
        wchar_t chars[2];
8188
        int charsize;
8189
        if (ch < 0x10000) {
8190
            chars[0] = (wchar_t)ch;
8191
            charsize = 1;
8192
        }
8193
        else {
8194
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8195
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8196
            charsize = 2;
8197
        }
8198
8199
        outsize = WideCharToMultiByte(code_page, flags,
8200
                                      chars, charsize,
8201
                                      buffer, Py_ARRAY_LENGTH(buffer),
8202
                                      NULL, pusedDefaultChar);
8203
        if (outsize > 0) {
8204
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8205
            {
8206
                pos++;
8207
                memcpy(out, buffer, outsize);
8208
                out += outsize;
8209
                continue;
8210
            }
8211
        }
8212
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8213
            PyErr_SetFromWindowsErr(0);
8214
            goto error;
8215
        }
8216
8217
        rep = unicode_encode_call_errorhandler(
8218
                  errors, &errorHandler, encoding, reason,
8219
                  unicode, &exc,
8220
                  pos, pos + 1, &newpos);
8221
        if (rep == NULL)
8222
            goto error;
8223
8224
        Py_ssize_t morebytes = pos - newpos;
8225
        if (PyBytes_Check(rep)) {
8226
            outsize = PyBytes_GET_SIZE(rep);
8227
            morebytes += outsize;
8228
            if (morebytes > 0) {
8229
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8230
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
8231
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8232
                    Py_DECREF(rep);
8233
                    goto error;
8234
                }
8235
                out = PyBytes_AS_STRING(*outbytes) + offset;
8236
            }
8237
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8238
            out += outsize;
8239
        }
8240
        else {
8241
            Py_ssize_t i;
8242
            int kind;
8243
            const void *data;
8244
8245
            outsize = PyUnicode_GET_LENGTH(rep);
8246
            morebytes += outsize;
8247
            if (morebytes > 0) {
8248
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8249
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
8250
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8251
                    Py_DECREF(rep);
8252
                    goto error;
8253
                }
8254
                out = PyBytes_AS_STRING(*outbytes) + offset;
8255
            }
8256
            kind = PyUnicode_KIND(rep);
8257
            data = PyUnicode_DATA(rep);
8258
            for (i=0; i < outsize; i++) {
8259
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8260
                if (ch > 127) {
8261
                    raise_encode_exception(&exc,
8262
                        encoding, unicode,
8263
                        pos, pos + 1,
8264
                        "unable to encode error handler result to ASCII");
8265
                    Py_DECREF(rep);
8266
                    goto error;
8267
                }
8268
                *out = (unsigned char)ch;
8269
                out++;
8270
            }
8271
        }
8272
        pos = newpos;
8273
        Py_DECREF(rep);
8274
    }
8275
    /* write a NUL byte */
8276
    *out = 0;
8277
    outsize = out - PyBytes_AS_STRING(*outbytes);
8278
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8279
    if (_PyBytes_Resize(outbytes, outsize) < 0)
8280
        goto error;
8281
    ret = 0;
8282
8283
error:
8284
    Py_XDECREF(encoding_obj);
8285
    Py_XDECREF(errorHandler);
8286
    Py_XDECREF(exc);
8287
    return ret;
8288
}
8289
8290
/* Encode the str object `unicode` to the Windows code page `code_page`.
 *
 * Each chunk is encoded in strict mode first; if that reports an
 * unencodable character the chunk is encoded again through the `errors`
 * handler.  When NEED_RETRY is defined the string is processed in chunks
 * of at most DECODING_CHUNK_SIZE characters.
 *
 * Returns a new bytes object, or NULL with an exception set.
 */
static PyObject *
encode_code_page(int code_page,
                 PyObject *unicode,
                 const char *errors)
{
    PyObject *result = NULL;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    Py_ssize_t remaining = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (remaining == 0) {
        return PyBytes_FromStringAndSize(NULL, 0);
    }

    Py_ssize_t offset = 0;
    for (;;) {
        int chunk_len, last;

#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            last = 0;
        }
        else
#endif
        {
            chunk_len = (int)remaining;
            last = 1;
        }

        int ret = encode_code_page_strict(code_page, &result,
                                          unicode, offset, chunk_len,
                                          errors);
        if (ret == -2) {
            /* Strict mode hit an unencodable character */
            ret = encode_code_page_errors(code_page, &result,
                                          unicode, offset,
                                          chunk_len, errors);
        }
        if (ret < 0) {
            Py_XDECREF(result);
            return NULL;
        }

        offset += chunk_len;
        remaining -= chunk_len;

        if (last) {
            break;
        }
    }

    return result;
}
8348
8349
/* Public wrapper: encode `unicode` to the given Windows code page; all
   arguments are forwarded to encode_code_page(). */
PyObject *
PyUnicode_EncodeCodePage(int code_page, PyObject *unicode, const char *errors)
{
    return encode_code_page(code_page, unicode, errors);
}
8356
8357
/* Public wrapper: encode `unicode` to the CP_ACP code page with the
   default error handling (errors == NULL). */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *errors = NULL;

    return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
}
8362
8363
#undef NEED_RETRY
8364
8365
#endif /* MS_WINDOWS */
8366
8367
/* --- Character Mapping Codec -------------------------------------------- */
8368
8369
static int
8370
charmap_decode_string(const char *s,
8371
                      Py_ssize_t size,
8372
                      PyObject *mapping,
8373
                      const char *errors,
8374
                      _PyUnicodeWriter *writer)
8375
8.55k
{
8376
8.55k
    const char *starts = s;
8377
8.55k
    const char *e;
8378
8.55k
    Py_ssize_t startinpos, endinpos;
8379
8.55k
    PyObject *errorHandler = NULL, *exc = NULL;
8380
8.55k
    Py_ssize_t maplen;
8381
8.55k
    int mapkind;
8382
8.55k
    const void *mapdata;
8383
8.55k
    Py_UCS4 x;
8384
8.55k
    unsigned char ch;
8385
8386
8.55k
    maplen = PyUnicode_GET_LENGTH(mapping);
8387
8.55k
    mapdata = PyUnicode_DATA(mapping);
8388
8.55k
    mapkind = PyUnicode_KIND(mapping);
8389
8390
8.55k
    e = s + size;
8391
8392
8.55k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8393
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8394
         * is disabled in encoding aliases, latin1 is preferred because
8395
         * its implementation is faster. */
8396
155
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8397
155
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8398
155
        Py_UCS4 maxchar = writer->maxchar;
8399
8400
155
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8401
2.22k
        while (s < e) {
8402
2.06k
            ch = *s;
8403
2.06k
            x = mapdata_ucs1[ch];
8404
2.06k
            if (x > maxchar) {
8405
146
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8406
0
                    goto onError;
8407
146
                maxchar = writer->maxchar;
8408
146
                outdata = (Py_UCS1 *)writer->data;
8409
146
            }
8410
2.06k
            outdata[writer->pos] = x;
8411
2.06k
            writer->pos++;
8412
2.06k
            ++s;
8413
2.06k
        }
8414
155
        return 0;
8415
155
    }
8416
8417
33.2k
    while (s < e) {
8418
29.3k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8419
29.3k
            int outkind = writer->kind;
8420
29.3k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8421
29.3k
            if (outkind == PyUnicode_1BYTE_KIND) {
8422
15.5k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8423
15.5k
                Py_UCS4 maxchar = writer->maxchar;
8424
66.8k
                while (s < e) {
8425
65.4k
                    ch = *s;
8426
65.4k
                    x = mapdata_ucs2[ch];
8427
65.4k
                    if (x > maxchar)
8428
14.2k
                        goto Error;
8429
51.2k
                    outdata[writer->pos] = x;
8430
51.2k
                    writer->pos++;
8431
51.2k
                    ++s;
8432
51.2k
                }
8433
1.32k
                break;
8434
15.5k
            }
8435
13.8k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8436
13.8k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8437
241k
                while (s < e) {
8438
238k
                    ch = *s;
8439
238k
                    x = mapdata_ucs2[ch];
8440
238k
                    if (x == 0xFFFE)
8441
10.6k
                        goto Error;
8442
227k
                    outdata[writer->pos] = x;
8443
227k
                    writer->pos++;
8444
227k
                    ++s;
8445
227k
                }
8446
3.19k
                break;
8447
13.8k
            }
8448
29.3k
        }
8449
0
        ch = *s;
8450
8451
0
        if (ch < maplen)
8452
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8453
0
        else
8454
0
            x = 0xfffe; /* invalid value */
8455
24.8k
Error:
8456
24.8k
        if (x == 0xfffe)
8457
16.2k
        {
8458
            /* undefined mapping */
8459
16.2k
            startinpos = s-starts;
8460
16.2k
            endinpos = startinpos+1;
8461
16.2k
            if (unicode_decode_call_errorhandler_writer(
8462
16.2k
                    errors, &errorHandler,
8463
16.2k
                    "charmap", "character maps to <undefined>",
8464
16.2k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8465
16.2k
                    writer)) {
8466
19
                goto onError;
8467
19
            }
8468
16.2k
            continue;
8469
16.2k
        }
8470
8471
8.63k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8472
0
            goto onError;
8473
8.63k
        ++s;
8474
8.63k
    }
8475
8.38k
    Py_XDECREF(errorHandler);
8476
8.38k
    Py_XDECREF(exc);
8477
8.38k
    return 0;
8478
8479
19
onError:
8480
19
    Py_XDECREF(errorHandler);
8481
19
    Py_XDECREF(exc);
8482
19
    return -1;
8483
8.40k
}
8484
8485
static int
8486
charmap_decode_mapping(const char *s,
8487
                       Py_ssize_t size,
8488
                       PyObject *mapping,
8489
                       const char *errors,
8490
                       _PyUnicodeWriter *writer)
8491
0
{
8492
0
    const char *starts = s;
8493
0
    const char *e;
8494
0
    Py_ssize_t startinpos, endinpos;
8495
0
    PyObject *errorHandler = NULL, *exc = NULL;
8496
0
    unsigned char ch;
8497
0
    PyObject *key, *item = NULL;
8498
8499
0
    e = s + size;
8500
8501
0
    while (s < e) {
8502
0
        ch = *s;
8503
8504
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8505
0
        key = PyLong_FromLong((long)ch);
8506
0
        if (key == NULL)
8507
0
            goto onError;
8508
8509
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8510
0
        Py_DECREF(key);
8511
0
        if (rc == 0) {
8512
            /* No mapping found means: mapping is undefined. */
8513
0
            goto Undefined;
8514
0
        }
8515
0
        if (item == NULL) {
8516
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8517
                /* No mapping found means: mapping is undefined. */
8518
0
                PyErr_Clear();
8519
0
                goto Undefined;
8520
0
            } else
8521
0
                goto onError;
8522
0
        }
8523
8524
        /* Apply mapping */
8525
0
        if (item == Py_None)
8526
0
            goto Undefined;
8527
0
        if (PyLong_Check(item)) {
8528
0
            long value = PyLong_AsLong(item);
8529
0
            if (value == 0xFFFE)
8530
0
                goto Undefined;
8531
0
            if (value < 0 || value > MAX_UNICODE) {
8532
0
                PyErr_Format(PyExc_TypeError,
8533
0
                             "character mapping must be in range(0x%x)",
8534
0
                             (unsigned long)MAX_UNICODE + 1);
8535
0
                goto onError;
8536
0
            }
8537
8538
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8539
0
                goto onError;
8540
0
        }
8541
0
        else if (PyUnicode_Check(item)) {
8542
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8543
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8544
0
                if (value == 0xFFFE)
8545
0
                    goto Undefined;
8546
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8547
0
                    goto onError;
8548
0
            }
8549
0
            else {
8550
0
                writer->overallocate = 1;
8551
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8552
0
                    goto onError;
8553
0
            }
8554
0
        }
8555
0
        else {
8556
            /* wrong return value */
8557
0
            PyErr_SetString(PyExc_TypeError,
8558
0
                            "character mapping must return integer, None or str");
8559
0
            goto onError;
8560
0
        }
8561
0
        Py_CLEAR(item);
8562
0
        ++s;
8563
0
        continue;
8564
8565
0
Undefined:
8566
        /* undefined mapping */
8567
0
        Py_CLEAR(item);
8568
0
        startinpos = s-starts;
8569
0
        endinpos = startinpos+1;
8570
0
        if (unicode_decode_call_errorhandler_writer(
8571
0
                errors, &errorHandler,
8572
0
                "charmap", "character maps to <undefined>",
8573
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8574
0
                writer)) {
8575
0
            goto onError;
8576
0
        }
8577
0
    }
8578
0
    Py_XDECREF(errorHandler);
8579
0
    Py_XDECREF(exc);
8580
0
    return 0;
8581
8582
0
onError:
8583
0
    Py_XDECREF(item);
8584
0
    Py_XDECREF(errorHandler);
8585
0
    Py_XDECREF(exc);
8586
0
    return -1;
8587
0
}
8588
8589
PyObject *
8590
PyUnicode_DecodeCharmap(const char *s,
8591
                        Py_ssize_t size,
8592
                        PyObject *mapping,
8593
                        const char *errors)
8594
8.55k
{
8595
8.55k
    _PyUnicodeWriter writer;
8596
8597
    /* Default to Latin-1 */
8598
8.55k
    if (mapping == NULL)
8599
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8600
8601
8.55k
    if (size == 0)
8602
0
        _Py_RETURN_UNICODE_EMPTY();
8603
8.55k
    _PyUnicodeWriter_Init(&writer);
8604
8.55k
    writer.min_length = size;
8605
8.55k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8606
0
        goto onError;
8607
8608
8.55k
    if (PyUnicode_CheckExact(mapping)) {
8609
8.55k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8610
19
            goto onError;
8611
8.55k
    }
8612
0
    else {
8613
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8614
0
            goto onError;
8615
0
    }
8616
8.53k
    return _PyUnicodeWriter_Finish(&writer);
8617
8618
19
  onError:
8619
19
    _PyUnicodeWriter_Dealloc(&writer);
8620
19
    return NULL;
8621
8.55k
}
8622
8623
/* Charmap encoding: the lookup table */
8624
8625
/*[clinic input]
8626
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8627
[clinic start generated code]*/
8628
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8629
8630
struct encoding_map {
8631
    PyObject_HEAD
8632
    unsigned char level1[32];
8633
    int count2, count3;
8634
    unsigned char level23[1];
8635
};
8636
8637
/*[clinic input]
8638
EncodingMap.size
8639
8640
Return the size (in bytes) of this object.
8641
[clinic start generated code]*/
8642
8643
static PyObject *
8644
EncodingMap_size_impl(struct encoding_map *self)
8645
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8646
0
{
8647
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8648
0
                           128*self->count3);
8649
0
}
8650
8651
static PyMethodDef encoding_map_methods[] = {
8652
    ENCODINGMAP_SIZE_METHODDEF
8653
    {NULL, NULL}
8654
};
8655
8656
static PyTypeObject EncodingMapType = {
8657
    PyVarObject_HEAD_INIT(NULL, 0)
8658
    .tp_name = "EncodingMap",
8659
    .tp_basicsize = sizeof(struct encoding_map),
8660
    /* methods */
8661
    .tp_flags = Py_TPFLAGS_DEFAULT,
8662
    .tp_methods = encoding_map_methods,
8663
};
8664
8665
PyObject*
8666
PyUnicode_BuildEncodingMap(PyObject* string)
8667
115
{
8668
115
    PyObject *result;
8669
115
    struct encoding_map *mresult;
8670
115
    int i;
8671
115
    int need_dict = 0;
8672
115
    unsigned char level1[32];
8673
115
    unsigned char level2[512];
8674
115
    unsigned char *mlevel1, *mlevel2, *mlevel3;
8675
115
    int count2 = 0, count3 = 0;
8676
115
    int kind;
8677
115
    const void *data;
8678
115
    int length;
8679
115
    Py_UCS4 ch;
8680
8681
115
    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8682
0
        PyErr_BadArgument();
8683
0
        return NULL;
8684
0
    }
8685
115
    kind = PyUnicode_KIND(string);
8686
115
    data = PyUnicode_DATA(string);
8687
115
    length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
8688
115
    memset(level1, 0xFF, sizeof level1);
8689
115
    memset(level2, 0xFF, sizeof level2);
8690
8691
    /* If there isn't a one-to-one mapping of NULL to \0,
8692
       or if there are non-BMP characters, we need to use
8693
       a mapping dictionary. */
8694
115
    if (PyUnicode_READ(kind, data, 0) != 0)
8695
0
        need_dict = 1;
8696
29.4k
    for (i = 1; i < length; i++) {
8697
29.3k
        int l1, l2;
8698
29.3k
        ch = PyUnicode_READ(kind, data, i);
8699
29.3k
        if (ch == 0 || ch > 0xFFFF) {
8700
0
            need_dict = 1;
8701
0
            break;
8702
0
        }
8703
29.3k
        if (ch == 0xFFFE)
8704
            /* unmapped character */
8705
773
            continue;
8706
28.5k
        l1 = ch >> 11;
8707
28.5k
        l2 = ch >> 7;
8708
28.5k
        if (level1[l1] == 0xFF)
8709
209
            level1[l1] = count2++;
8710
28.5k
        if (level2[l2] == 0xFF)
8711
625
            level2[l2] = count3++;
8712
28.5k
    }
8713
8714
115
    if (count2 >= 0xFF || count3 >= 0xFF)
8715
0
        need_dict = 1;
8716
8717
115
    if (need_dict) {
8718
0
        PyObject *result = PyDict_New();
8719
0
        if (!result)
8720
0
            return NULL;
8721
0
        for (i = 0; i < length; i++) {
8722
0
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
8723
0
            PyObject *key = PyLong_FromLong(c);
8724
0
            if (key == NULL) {
8725
0
                Py_DECREF(result);
8726
0
                return NULL;
8727
0
            }
8728
0
            PyObject *value = PyLong_FromLong(i);
8729
0
            if (value == NULL) {
8730
0
                Py_DECREF(key);
8731
0
                Py_DECREF(result);
8732
0
                return NULL;
8733
0
            }
8734
0
            int rc = PyDict_SetItem(result, key, value);
8735
0
            Py_DECREF(key);
8736
0
            Py_DECREF(value);
8737
0
            if (rc < 0) {
8738
0
                Py_DECREF(result);
8739
0
                return NULL;
8740
0
            }
8741
0
        }
8742
0
        return result;
8743
0
    }
8744
8745
    /* Create a three-level trie */
8746
115
    result = PyObject_Malloc(sizeof(struct encoding_map) +
8747
115
                             16*count2 + 128*count3 - 1);
8748
115
    if (!result) {
8749
0
        return PyErr_NoMemory();
8750
0
    }
8751
8752
115
    _PyObject_Init(result, &EncodingMapType);
8753
115
    mresult = (struct encoding_map*)result;
8754
115
    mresult->count2 = count2;
8755
115
    mresult->count3 = count3;
8756
115
    mlevel1 = mresult->level1;
8757
115
    mlevel2 = mresult->level23;
8758
115
    mlevel3 = mresult->level23 + 16*count2;
8759
115
    memcpy(mlevel1, level1, 32);
8760
115
    memset(mlevel2, 0xFF, 16*count2);
8761
115
    memset(mlevel3, 0, 128*count3);
8762
115
    count3 = 0;
8763
29.4k
    for (i = 1; i < length; i++) {
8764
29.3k
        int o1, o2, o3, i2, i3;
8765
29.3k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8766
29.3k
        if (ch == 0xFFFE)
8767
            /* unmapped character */
8768
773
            continue;
8769
28.5k
        o1 = ch>>11;
8770
28.5k
        o2 = (ch>>7) & 0xF;
8771
28.5k
        i2 = 16*mlevel1[o1] + o2;
8772
28.5k
        if (mlevel2[i2] == 0xFF)
8773
625
            mlevel2[i2] = count3++;
8774
28.5k
        o3 = ch & 0x7F;
8775
28.5k
        i3 = 128*mlevel2[i2] + o3;
8776
28.5k
        mlevel3[i3] = i;
8777
28.5k
    }
8778
115
    return result;
8779
115
}
8780
8781
static int
8782
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8783
0
{
8784
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8785
0
    int l1 = c>>11;
8786
0
    int l2 = (c>>7) & 0xF;
8787
0
    int l3 = c & 0x7F;
8788
0
    int i;
8789
8790
0
    if (c > 0xFFFF)
8791
0
        return -1;
8792
0
    if (c == 0)
8793
0
        return 0;
8794
    /* level 1*/
8795
0
    i = map->level1[l1];
8796
0
    if (i == 0xFF) {
8797
0
        return -1;
8798
0
    }
8799
    /* level 2*/
8800
0
    i = map->level23[16*i+l2];
8801
0
    if (i == 0xFF) {
8802
0
        return -1;
8803
0
    }
8804
    /* level 3 */
8805
0
    i = map->level23[16*map->count2 + 128*i + l3];
8806
0
    if (i == 0) {
8807
0
        return -1;
8808
0
    }
8809
0
    return i;
8810
0
}
8811
8812
/* Lookup the character in the mapping.
8813
   On success, return PyLong, PyBytes or None (if the character can't be found).
8814
   If the result is PyLong, put its value in replace.
8815
   On error, return NULL.
8816
   */
8817
static PyObject *
8818
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8819
0
{
8820
0
    PyObject *w = PyLong_FromLong((long)c);
8821
0
    PyObject *x;
8822
8823
0
    if (w == NULL)
8824
0
        return NULL;
8825
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8826
0
    Py_DECREF(w);
8827
0
    if (rc == 0) {
8828
        /* No mapping found means: mapping is undefined. */
8829
0
        Py_RETURN_NONE;
8830
0
    }
8831
0
    if (x == NULL) {
8832
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8833
            /* No mapping found means: mapping is undefined. */
8834
0
            PyErr_Clear();
8835
0
            Py_RETURN_NONE;
8836
0
        } else
8837
0
            return NULL;
8838
0
    }
8839
0
    else if (x == Py_None)
8840
0
        return x;
8841
0
    else if (PyLong_Check(x)) {
8842
0
        long value = PyLong_AsLong(x);
8843
0
        if (value < 0 || value > 255) {
8844
0
            PyErr_SetString(PyExc_TypeError,
8845
0
                            "character mapping must be in range(256)");
8846
0
            Py_DECREF(x);
8847
0
            return NULL;
8848
0
        }
8849
0
        *replace = (unsigned char)value;
8850
0
        return x;
8851
0
    }
8852
0
    else if (PyBytes_Check(x))
8853
0
        return x;
8854
0
    else {
8855
        /* wrong return value */
8856
0
        PyErr_Format(PyExc_TypeError,
8857
0
                     "character mapping must return integer, bytes or None, not %.400s",
8858
0
                     Py_TYPE(x)->tp_name);
8859
0
        Py_DECREF(x);
8860
0
        return NULL;
8861
0
    }
8862
0
}
8863
8864
static int
8865
charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8866
0
{
8867
0
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8868
    /* exponentially overallocate to minimize reallocations */
8869
0
    if (requiredsize < 2*outsize)
8870
0
        requiredsize = 2*outsize;
8871
0
    if (_PyBytes_Resize(outobj, requiredsize))
8872
0
        return -1;
8873
0
    return 0;
8874
0
}
8875
8876
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
8879
/* lookup the character, put the result in the output string and adjust
8880
   various state variables. Resize the output bytes object if not enough
8881
   space is available. Return a new reference to the object that
8882
   was put in the output buffer, or Py_None, if the mapping was undefined
8883
   (in which case no character was written) or NULL, if a
8884
   reallocation error occurred. The caller must decref the result */
8885
static charmapencode_result
8886
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8887
                     PyObject **outobj, Py_ssize_t *outpos)
8888
0
{
8889
0
    PyObject *rep;
8890
0
    unsigned char replace;
8891
0
    char *outstart;
8892
0
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8893
8894
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8895
0
        int res = encoding_map_lookup(c, mapping);
8896
0
        Py_ssize_t requiredsize = *outpos+1;
8897
0
        if (res == -1)
8898
0
            return enc_FAILED;
8899
0
        if (outsize<requiredsize)
8900
0
            if (charmapencode_resize(outobj, outpos, requiredsize))
8901
0
                return enc_EXCEPTION;
8902
0
        outstart = PyBytes_AS_STRING(*outobj);
8903
0
        outstart[(*outpos)++] = (char)res;
8904
0
        return enc_SUCCESS;
8905
0
    }
8906
8907
0
    rep = charmapencode_lookup(c, mapping, &replace);
8908
0
    if (rep==NULL)
8909
0
        return enc_EXCEPTION;
8910
0
    else if (rep==Py_None) {
8911
0
        Py_DECREF(rep);
8912
0
        return enc_FAILED;
8913
0
    } else {
8914
0
        if (PyLong_Check(rep)) {
8915
0
            Py_ssize_t requiredsize = *outpos+1;
8916
0
            if (outsize<requiredsize)
8917
0
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8918
0
                    Py_DECREF(rep);
8919
0
                    return enc_EXCEPTION;
8920
0
                }
8921
0
            outstart = PyBytes_AS_STRING(*outobj);
8922
0
            outstart[(*outpos)++] = (char)replace;
8923
0
        }
8924
0
        else {
8925
0
            const char *repchars = PyBytes_AS_STRING(rep);
8926
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8927
0
            Py_ssize_t requiredsize = *outpos+repsize;
8928
0
            if (outsize<requiredsize)
8929
0
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8930
0
                    Py_DECREF(rep);
8931
0
                    return enc_EXCEPTION;
8932
0
                }
8933
0
            outstart = PyBytes_AS_STRING(*outobj);
8934
0
            memcpy(outstart + *outpos, repchars, repsize);
8935
0
            *outpos += repsize;
8936
0
        }
8937
0
    }
8938
0
    Py_DECREF(rep);
8939
0
    return enc_SUCCESS;
8940
0
}
8941
8942
/* handle an error in PyUnicode_EncodeCharmap
8943
   Return 0 on success, -1 on error */
8944
static int
8945
charmap_encoding_error(
8946
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8947
    PyObject **exceptionObject,
8948
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8949
    PyObject **res, Py_ssize_t *respos)
8950
0
{
8951
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8952
0
    Py_ssize_t size, repsize;
8953
0
    Py_ssize_t newpos;
8954
0
    int kind;
8955
0
    const void *data;
8956
0
    Py_ssize_t index;
8957
    /* startpos for collecting unencodable chars */
8958
0
    Py_ssize_t collstartpos = *inpos;
8959
0
    Py_ssize_t collendpos = *inpos+1;
8960
0
    Py_ssize_t collpos;
8961
0
    const char *encoding = "charmap";
8962
0
    const char *reason = "character maps to <undefined>";
8963
0
    charmapencode_result x;
8964
0
    Py_UCS4 ch;
8965
0
    int val;
8966
8967
0
    size = PyUnicode_GET_LENGTH(unicode);
8968
    /* find all unencodable characters */
8969
0
    while (collendpos < size) {
8970
0
        PyObject *rep;
8971
0
        unsigned char replace;
8972
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8973
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8974
0
            val = encoding_map_lookup(ch, mapping);
8975
0
            if (val != -1)
8976
0
                break;
8977
0
            ++collendpos;
8978
0
            continue;
8979
0
        }
8980
8981
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8982
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8983
0
        if (rep==NULL)
8984
0
            return -1;
8985
0
        else if (rep!=Py_None) {
8986
0
            Py_DECREF(rep);
8987
0
            break;
8988
0
        }
8989
0
        Py_DECREF(rep);
8990
0
        ++collendpos;
8991
0
    }
8992
    /* cache callback name lookup
8993
     * (if not done yet, i.e. it's the first error) */
8994
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8995
0
        *error_handler = _Py_GetErrorHandler(errors);
8996
8997
0
    switch (*error_handler) {
8998
0
    case _Py_ERROR_STRICT:
8999
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
9000
0
        return -1;
9001
9002
0
    case _Py_ERROR_REPLACE:
9003
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
9004
0
            x = charmapencode_output('?', mapping, res, respos);
9005
0
            if (x==enc_EXCEPTION) {
9006
0
                return -1;
9007
0
            }
9008
0
            else if (x==enc_FAILED) {
9009
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
9010
0
                return -1;
9011
0
            }
9012
0
        }
9013
0
        _Py_FALLTHROUGH;
9014
0
    case _Py_ERROR_IGNORE:
9015
0
        *inpos = collendpos;
9016
0
        break;
9017
9018
0
    case _Py_ERROR_XMLCHARREFREPLACE:
9019
        /* generate replacement (temporarily (mis)uses p) */
9020
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
9021
0
            char buffer[2+29+1+1];
9022
0
            char *cp;
9023
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
9024
0
            for (cp = buffer; *cp; ++cp) {
9025
0
                x = charmapencode_output(*cp, mapping, res, respos);
9026
0
                if (x==enc_EXCEPTION)
9027
0
                    return -1;
9028
0
                else if (x==enc_FAILED) {
9029
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
9030
0
                    return -1;
9031
0
                }
9032
0
            }
9033
0
        }
9034
0
        *inpos = collendpos;
9035
0
        break;
9036
9037
0
    default:
9038
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
9039
0
                                                      encoding, reason, unicode, exceptionObject,
9040
0
                                                      collstartpos, collendpos, &newpos);
9041
0
        if (repunicode == NULL)
9042
0
            return -1;
9043
0
        if (PyBytes_Check(repunicode)) {
9044
            /* Directly copy bytes result to output. */
9045
0
            Py_ssize_t outsize = PyBytes_Size(*res);
9046
0
            Py_ssize_t requiredsize;
9047
0
            repsize = PyBytes_Size(repunicode);
9048
0
            requiredsize = *respos + repsize;
9049
0
            if (requiredsize > outsize)
9050
                /* Make room for all additional bytes. */
9051
0
                if (charmapencode_resize(res, respos, requiredsize)) {
9052
0
                    Py_DECREF(repunicode);
9053
0
                    return -1;
9054
0
                }
9055
0
            memcpy(PyBytes_AsString(*res) + *respos,
9056
0
                   PyBytes_AsString(repunicode),  repsize);
9057
0
            *respos += repsize;
9058
0
            *inpos = newpos;
9059
0
            Py_DECREF(repunicode);
9060
0
            break;
9061
0
        }
9062
        /* generate replacement  */
9063
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
9064
0
        data = PyUnicode_DATA(repunicode);
9065
0
        kind = PyUnicode_KIND(repunicode);
9066
0
        for (index = 0; index < repsize; index++) {
9067
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
9068
0
            x = charmapencode_output(repch, mapping, res, respos);
9069
0
            if (x==enc_EXCEPTION) {
9070
0
                Py_DECREF(repunicode);
9071
0
                return -1;
9072
0
            }
9073
0
            else if (x==enc_FAILED) {
9074
0
                Py_DECREF(repunicode);
9075
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
9076
0
                return -1;
9077
0
            }
9078
0
        }
9079
0
        *inpos = newpos;
9080
0
        Py_DECREF(repunicode);
9081
0
    }
9082
0
    return 0;
9083
0
}
9084
9085
/* Encode the str `unicode` to bytes through `mapping`.

   A NULL mapping falls back to unicode_encode_ucs1() with a limit of 256.
   Otherwise each character is passed to charmapencode_output();
   characters it cannot encode are handed to charmap_encoding_error(),
   which applies the `errors` handler. Returns a new bytes object, or NULL
   with an exception set. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    PyObject *res = NULL;                 /* output object */
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    Py_ssize_t inpos = 0;                 /* current input position */
    Py_ssize_t respos = 0;                /* current output position */

    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    const void *data = PyUnicode_DATA(unicode);
    int kind = PyUnicode_KIND(unicode);

    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL) {
        goto onError;
    }
    if (size == 0) {
        return res;
    }

    while (inpos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
        /* try to encode it */
        switch (charmapencode_output(ch, mapping, &res, &respos)) {
        case enc_EXCEPTION:
            /* error */
            goto onError;
        case enc_FAILED:
            /* unencodable character: the handler advances inpos itself */
            if (charmap_encoding_error(unicode, &inpos, mapping,
                                       &exc,
                                       &error_handler, &error_handler_obj, errors,
                                       &res, &respos)) {
                goto onError;
            }
            break;
        default:
            /* done with this character => adjust input position */
            ++inpos;
            break;
        }
    }

    /* Resize if we allocated too much */
    if (respos < PyBytes_GET_SIZE(res)) {
        if (_PyBytes_Resize(&res, respos) < 0) {
            goto onError;
        }
    }
    goto done;

  onError:
    Py_CLEAR(res);
  done:
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return res;
}
9153
9154
/* Encode `unicode` through `mapping` with the default error handling
   (errors == NULL). Both arguments are required: a non-str `unicode` or a
   NULL `mapping` raises via PyErr_BadArgument() and returns NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
9164
9165
/* create or adjust a UnicodeTranslateError */
9166
static void
9167
make_translate_exception(PyObject **exceptionObject,
9168
                         PyObject *unicode,
9169
                         Py_ssize_t startpos, Py_ssize_t endpos,
9170
                         const char *reason)
9171
0
{
9172
0
    if (*exceptionObject == NULL) {
9173
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9174
0
            unicode, startpos, endpos, reason);
9175
0
    }
9176
0
    else {
9177
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9178
0
            goto onError;
9179
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9180
0
            goto onError;
9181
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9182
0
            goto onError;
9183
0
        return;
9184
0
      onError:
9185
0
        Py_CLEAR(*exceptionObject);
9186
0
    }
9187
0
}
9188
9189
/* error handling callback helper:
9190
   build arguments, call the callback and check the arguments,
9191
   put the result into newpos and return the replacement string, which
9192
   has to be freed by the caller */
9193
static PyObject *
9194
unicode_translate_call_errorhandler(const char *errors,
9195
                                    PyObject **errorHandler,
9196
                                    const char *reason,
9197
                                    PyObject *unicode, PyObject **exceptionObject,
9198
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9199
                                    Py_ssize_t *newpos)
9200
0
{
9201
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9202
9203
0
    Py_ssize_t i_newpos;
9204
0
    PyObject *restuple;
9205
0
    PyObject *resunicode;
9206
9207
0
    if (*errorHandler == NULL) {
9208
0
        *errorHandler = PyCodec_LookupError(errors);
9209
0
        if (*errorHandler == NULL)
9210
0
            return NULL;
9211
0
    }
9212
9213
0
    make_translate_exception(exceptionObject,
9214
0
                             unicode, startpos, endpos, reason);
9215
0
    if (*exceptionObject == NULL)
9216
0
        return NULL;
9217
9218
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9219
0
    if (restuple == NULL)
9220
0
        return NULL;
9221
0
    if (!PyTuple_Check(restuple)) {
9222
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9223
0
        Py_DECREF(restuple);
9224
0
        return NULL;
9225
0
    }
9226
0
    if (!PyArg_ParseTuple(restuple, argparse,
9227
0
                          &resunicode, &i_newpos)) {
9228
0
        Py_DECREF(restuple);
9229
0
        return NULL;
9230
0
    }
9231
0
    if (i_newpos<0)
9232
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9233
0
    else
9234
0
        *newpos = i_newpos;
9235
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9236
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9237
0
        Py_DECREF(restuple);
9238
0
        return NULL;
9239
0
    }
9240
0
    Py_INCREF(resunicode);
9241
0
    Py_DECREF(restuple);
9242
0
    return resunicode;
9243
0
}
9244
9245
/* Lookup the character ch in the mapping and put the result in result,
9246
   which must be decrefed by the caller.
9247
   The result can be PyLong, PyUnicode, None or NULL.
9248
   If the result is PyLong, put its value in replace.
9249
   Return 0 on success, -1 on error */
9250
static int
9251
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9252
150
{
9253
150
    PyObject *w = PyLong_FromLong((long)c);
9254
150
    PyObject *x;
9255
9256
150
    if (w == NULL)
9257
0
        return -1;
9258
150
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9259
150
    Py_DECREF(w);
9260
150
    if (rc == 0) {
9261
        /* No mapping found means: use 1:1 mapping. */
9262
74
        *result = NULL;
9263
74
        return 0;
9264
74
    }
9265
76
    if (x == NULL) {
9266
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9267
            /* No mapping found means: use 1:1 mapping. */
9268
0
            PyErr_Clear();
9269
0
            *result = NULL;
9270
0
            return 0;
9271
0
        } else
9272
0
            return -1;
9273
0
    }
9274
76
    else if (x == Py_None) {
9275
0
        *result = x;
9276
0
        return 0;
9277
0
    }
9278
76
    else if (PyLong_Check(x)) {
9279
0
        long value = PyLong_AsLong(x);
9280
0
        if (value < 0 || value > MAX_UNICODE) {
9281
0
            PyErr_Format(PyExc_ValueError,
9282
0
                         "character mapping must be in range(0x%x)",
9283
0
                         MAX_UNICODE+1);
9284
0
            Py_DECREF(x);
9285
0
            return -1;
9286
0
        }
9287
0
        *result = x;
9288
0
        *replace = (Py_UCS4)value;
9289
0
        return 0;
9290
0
    }
9291
76
    else if (PyUnicode_Check(x)) {
9292
76
        *result = x;
9293
76
        return 0;
9294
76
    }
9295
0
    else {
9296
        /* wrong return value */
9297
0
        PyErr_SetString(PyExc_TypeError,
9298
0
                        "character mapping must return integer, None or str");
9299
0
        Py_DECREF(x);
9300
0
        return -1;
9301
0
    }
9302
76
}
9303
9304
/* lookup the character, write the result into the writer.
9305
   Return 1 if the result was written into the writer, return 0 if the mapping
9306
   was undefined, raise an exception return -1 on error. */
9307
static int
9308
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9309
                        _PyUnicodeWriter *writer)
9310
133
{
9311
133
    PyObject *item;
9312
133
    Py_UCS4 replace;
9313
9314
133
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9315
0
        return -1;
9316
9317
133
    if (item == NULL) {
9318
        /* not found => default to 1:1 mapping */
9319
65
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9320
0
            return -1;
9321
0
        }
9322
65
        return 1;
9323
65
    }
9324
9325
68
    if (item == Py_None) {
9326
0
        Py_DECREF(item);
9327
0
        return 0;
9328
0
    }
9329
9330
68
    if (PyLong_Check(item)) {
9331
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9332
0
            Py_DECREF(item);
9333
0
            return -1;
9334
0
        }
9335
0
        Py_DECREF(item);
9336
0
        return 1;
9337
0
    }
9338
9339
68
    if (!PyUnicode_Check(item)) {
9340
0
        Py_DECREF(item);
9341
0
        return -1;
9342
0
    }
9343
9344
68
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9345
0
        Py_DECREF(item);
9346
0
        return -1;
9347
0
    }
9348
9349
68
    Py_DECREF(item);
9350
68
    return 1;
9351
68
}
9352
9353
static int
9354
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9355
                              Py_UCS1 *translate)
9356
17
{
9357
17
    PyObject *item = NULL;
9358
17
    Py_UCS4 replace;
9359
17
    int ret = 0;
9360
9361
17
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9362
0
        return -1;
9363
0
    }
9364
9365
17
    if (item == Py_None) {
9366
        /* deletion */
9367
0
        translate[ch] = 0xfe;
9368
0
    }
9369
17
    else if (item == NULL) {
9370
        /* not found => default to 1:1 mapping */
9371
9
        translate[ch] = ch;
9372
9
        return 1;
9373
9
    }
9374
8
    else if (PyLong_Check(item)) {
9375
0
        if (replace > 127) {
9376
            /* invalid character or character outside ASCII:
9377
               skip the fast translate */
9378
0
            goto exit;
9379
0
        }
9380
0
        translate[ch] = (Py_UCS1)replace;
9381
0
    }
9382
8
    else if (PyUnicode_Check(item)) {
9383
8
        if (PyUnicode_GET_LENGTH(item) != 1)
9384
8
            goto exit;
9385
9386
0
        replace = PyUnicode_READ_CHAR(item, 0);
9387
0
        if (replace > 127)
9388
0
            goto exit;
9389
0
        translate[ch] = (Py_UCS1)replace;
9390
0
    }
9391
0
    else {
9392
        /* not None, NULL, long or unicode */
9393
0
        goto exit;
9394
0
    }
9395
0
    ret = 1;
9396
9397
8
  exit:
9398
8
    Py_DECREF(item);
9399
8
    return ret;
9400
0
}
9401
9402
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9403
   was translated into writer, return 0 if the input string was partially
9404
   translated into writer, raise an exception and return -1 on error. */
9405
static int
9406
unicode_fast_translate(PyObject *input, PyObject *mapping,
9407
                       _PyUnicodeWriter *writer, int ignore,
9408
                       Py_ssize_t *input_pos)
9409
8
{
9410
8
    Py_UCS1 ascii_table[128], ch, ch2;
9411
8
    Py_ssize_t len;
9412
8
    const Py_UCS1 *in, *end;
9413
8
    Py_UCS1 *out;
9414
8
    int res = 0;
9415
9416
8
    len = PyUnicode_GET_LENGTH(input);
9417
9418
8
    memset(ascii_table, 0xff, 128);
9419
9420
8
    in = PyUnicode_1BYTE_DATA(input);
9421
8
    end = in + len;
9422
9423
8
    assert(PyUnicode_IS_ASCII(writer->buffer));
9424
8
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9425
8
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9426
9427
17
    for (; in < end; in++) {
9428
17
        ch = *in;
9429
17
        ch2 = ascii_table[ch];
9430
17
        if (ch2 == 0xff) {
9431
17
            int translate = unicode_fast_translate_lookup(mapping, ch,
9432
17
                                                          ascii_table);
9433
17
            if (translate < 0)
9434
0
                return -1;
9435
17
            if (translate == 0)
9436
8
                goto exit;
9437
9
            ch2 = ascii_table[ch];
9438
9
        }
9439
9
        if (ch2 == 0xfe) {
9440
0
            if (ignore)
9441
0
                continue;
9442
0
            goto exit;
9443
0
        }
9444
9
        assert(ch2 < 128);
9445
9
        *out = ch2;
9446
9
        out++;
9447
9
    }
9448
0
    res = 1;
9449
9450
8
exit:
9451
8
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9452
8
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9453
8
    return res;
9454
0
}
9455
9456
/* Translate `input` through `mapping` and return the result built by a
   _PyUnicodeWriter, or NULL with an exception set on failure.

   `errors` selects the error handler used for characters the mapping
   cannot translate; the literal "ignore" is special-cased here and simply
   skips such characters.  An empty input is returned via
   PyUnicode_FromObject(). */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    const char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    _PyUnicodeWriter writer;
    const void *data;
    Py_ssize_t size;
    Py_ssize_t pos = 0;
    int kind;
    int ignore;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    data = PyUnicode_DATA(input);
    kind = PyUnicode_KIND(input);
    size = PyUnicode_GET_LENGTH(input);
    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* Reserve room for a 1:1 ASCII translation; the writer grows on
       demand if replacements are longer or wider. */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    if (PyUnicode_IS_ASCII(input)) {
        int fast = unicode_fast_translate(input, mapping, &writer,
                                          ignore, &pos);
        if (fast < 0) {
            /* exc and errorHandler are still NULL here */
            goto onError;
        }
        if (fast == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
        /* fast == 0: continue with the generic loop from `pos` */
    }

    while (pos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        int translated = charmaptranslate_output(ch, mapping, &writer);
        Py_ssize_t run_start;
        Py_ssize_t run_end;

        if (translated < 0) {
            goto onError;
        }
        if (translated != 0) {
            /* character was written to the writer */
            pos++;
            continue;
        }

        /* Collect the run of consecutive untranslatable characters. */
        run_start = pos;
        for (run_end = pos + 1; run_end < size; ++run_end) {
            PyObject *x;
            Py_UCS4 replace;

            ch = PyUnicode_READ(kind, data, run_end);
            if (charmaptranslate_lookup(ch, mapping, &x, &replace)) {
                goto onError;
            }
            Py_XDECREF(x);
            /* only the pointer identity is compared after the decref */
            if (x != Py_None) {
                break;
            }
        }

        if (ignore) {
            pos = run_end;
        }
        else {
            Py_ssize_t newpos;
            int status;
            PyObject *repunicode = unicode_translate_call_errorhandler(
                errors, &errorHandler, reason, input, &exc,
                run_start, run_end, &newpos);

            if (repunicode == NULL) {
                goto onError;
            }
            status = _PyUnicodeWriter_WriteStr(&writer, repunicode);
            Py_DECREF(repunicode);
            if (status < 0) {
                goto onError;
            }
            pos = newpos;
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9572
9573
/* Public entry point: check `str` with ensure_unicode(), then translate
   it through `mapping` via _PyUnicode_TranslateCharmap().  Returns NULL
   when the check fails or the translation reports an error. */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    if (ensure_unicode(str) >= 0) {
        return _PyUnicode_TranslateCharmap(str, mapping, errors);
    }
    return NULL;
}
9582
9583
/* Produce an ASCII string from `unicode`:
     - an already-ASCII string is returned unchanged (new reference);
     - code points below 127 are copied;
     - other whitespace becomes ' ';
     - other decimal digits become the matching ASCII digit;
     - the first character that is none of the above is replaced by '?'
       and the result is truncated right after it.
   Returns NULL with an exception set if `unicode` is not a str or the
   result cannot be allocated. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    Py_ssize_t length;
    Py_ssize_t pos;
    PyObject *ascii;
    Py_UCS1 *dst;
    const void *src;
    int kind;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* nothing to transform */
        return Py_NewRef(unicode);
    }

    length = PyUnicode_GET_LENGTH(unicode);
    ascii = PyUnicode_New(length, 127);
    if (ascii == NULL) {
        return NULL;
    }

    dst = PyUnicode_1BYTE_DATA(ascii);
    kind = PyUnicode_KIND(unicode);
    src = PyUnicode_DATA(unicode);

    for (pos = 0; pos < length; ++pos) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src, pos);
        int digit;

        if (ch < 127) {
            dst[pos] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            dst[pos] = ' ';
            continue;
        }
        digit = Py_UNICODE_TODECIMAL(ch);
        if (digit >= 0) {
            dst[pos] = '0' + digit;
            continue;
        }

        /* Not convertible: mark it, terminate and shrink the result. */
        dst[pos] = '?';
        dst[pos + 1] = '\0';
        _PyUnicode_LENGTH(ascii) = pos + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(ascii, 1));
    return ascii;
}
9628
9629
/* --- Helpers ------------------------------------------------------------ */
9630
9631
/* Helper macro to fix up start/end slice values against a length.

   - end greater than len is clamped to len;
   - a negative end or start is offset by len, then clamped to 0 if it is
     still negative.

   `start` and `end` must be modifiable lvalues; they are updated in
   place.  Every parameter is parenthesized so that an expression argument
   (e.g. `cond ? a : b` for len) keeps its meaning.  Arguments may still
   be evaluated more than once, so they must not have side effects. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9650
9651
static Py_ssize_t
9652
any_find_slice(PyObject* s1, PyObject* s2,
9653
               Py_ssize_t start,
9654
               Py_ssize_t end,
9655
               int direction)
9656
13.2M
{
9657
13.2M
    int kind1, kind2;
9658
13.2M
    const void *buf1, *buf2;
9659
13.2M
    Py_ssize_t len1, len2, result;
9660
9661
13.2M
    kind1 = PyUnicode_KIND(s1);
9662
13.2M
    kind2 = PyUnicode_KIND(s2);
9663
13.2M
    if (kind1 < kind2)
9664
0
        return -1;
9665
9666
13.2M
    len1 = PyUnicode_GET_LENGTH(s1);
9667
13.2M
    len2 = PyUnicode_GET_LENGTH(s2);
9668
13.2M
    ADJUST_INDICES(start, end, len1);
9669
13.2M
    if (end - start < len2)
9670
17.7k
        return -1;
9671
9672
13.2M
    buf1 = PyUnicode_DATA(s1);
9673
13.2M
    buf2 = PyUnicode_DATA(s2);
9674
13.2M
    if (len2 == 1) {
9675
13.2M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9676
13.2M
        result = findchar((const char *)buf1 + kind1*start,
9677
13.2M
                          kind1, end - start, ch, direction);
9678
13.2M
        if (result == -1)
9679
174k
            return -1;
9680
13.0M
        else
9681
13.0M
            return start + result;
9682
13.2M
    }
9683
9684
0
    if (kind2 != kind1) {
9685
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9686
0
        if (!buf2)
9687
0
            return -2;
9688
0
    }
9689
9690
0
    if (direction > 0) {
9691
0
        switch (kind1) {
9692
0
        case PyUnicode_1BYTE_KIND:
9693
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9694
0
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9695
0
            else
9696
0
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9697
0
            break;
9698
0
        case PyUnicode_2BYTE_KIND:
9699
0
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9700
0
            break;
9701
0
        case PyUnicode_4BYTE_KIND:
9702
0
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9703
0
            break;
9704
0
        default:
9705
0
            Py_UNREACHABLE();
9706
0
        }
9707
0
    }
9708
0
    else {
9709
0
        switch (kind1) {
9710
0
        case PyUnicode_1BYTE_KIND:
9711
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9712
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9713
0
            else
9714
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9715
0
            break;
9716
0
        case PyUnicode_2BYTE_KIND:
9717
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9718
0
            break;
9719
0
        case PyUnicode_4BYTE_KIND:
9720
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9721
0
            break;
9722
0
        default:
9723
0
            Py_UNREACHABLE();
9724
0
        }
9725
0
    }
9726
9727
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9728
0
    if (kind2 != kind1)
9729
0
        PyMem_Free((void *)buf2);
9730
9731
0
    return result;
9732
0
}
9733
9734
/* _PyUnicode_InsertThousandsGrouping() helper functions */
9735
#include "stringlib/localeutil.h"
9736
9737
/**
9738
 * InsertThousandsGrouping:
9739
 * @writer: Unicode writer.
9740
 * @n_buffer: Number of characters in @buffer.
9741
 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9742
 * @d_pos: Start of digits string.
9743
 * @n_digits: The number of digits in the string, in which we want
9744
 *            to put the grouping chars.
9745
 * @min_width: The minimum width of the digits in the output string.
9746
 *             Output will be zero-padded on the left to fill.
9747
 * @grouping: see definition in localeconv().
9748
 * @thousands_sep: see definition in localeconv().
9749
 *
9750
 * There are 2 modes: counting and filling. If @writer is NULL,
9751
 *  we are in counting mode, else filling mode.
9752
 * If counting, the required buffer size is returned.
9753
 * If filling, we know the buffer will be large enough, so we don't
9754
 *  need to pass in the buffer size.
9755
 * Inserts thousand grouping characters (as defined by grouping and
9756
 *  thousands_sep) into @writer.
9757
 *
9758
 * Return value: -1 on error, number of characters otherwise.
9759
 **/
9760
Py_ssize_t
9761
_PyUnicode_InsertThousandsGrouping(
9762
    _PyUnicodeWriter *writer,
9763
    Py_ssize_t n_buffer,
9764
    PyObject *digits,
9765
    Py_ssize_t d_pos,
9766
    Py_ssize_t n_digits,
9767
    Py_ssize_t min_width,
9768
    const char *grouping,
9769
    PyObject *thousands_sep,
9770
    Py_UCS4 *maxchar,
9771
    int forward)
9772
128
{
9773
128
    min_width = Py_MAX(0, min_width);
9774
128
    if (writer) {
9775
64
        assert(digits != NULL);
9776
64
        assert(maxchar == NULL);
9777
64
    }
9778
64
    else {
9779
64
        assert(digits == NULL);
9780
64
        assert(maxchar != NULL);
9781
64
    }
9782
128
    assert(0 <= d_pos);
9783
128
    assert(0 <= n_digits);
9784
128
    assert(grouping != NULL);
9785
9786
128
    Py_ssize_t count = 0;
9787
128
    Py_ssize_t n_zeros;
9788
128
    int loop_broken = 0;
9789
128
    int use_separator = 0; /* First time through, don't append the
9790
                              separator. They only go between
9791
                              groups. */
9792
128
    Py_ssize_t buffer_pos;
9793
128
    Py_ssize_t digits_pos;
9794
128
    Py_ssize_t len;
9795
128
    Py_ssize_t n_chars;
9796
128
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9797
                                        be looked at */
9798
    /* A generator that returns all of the grouping widths, until it
9799
       returns 0. */
9800
128
    GroupGenerator groupgen;
9801
128
    GroupGenerator_init(&groupgen, grouping);
9802
128
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9803
9804
    /* if digits are not grouped, thousands separator
9805
       should be an empty string */
9806
128
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9807
9808
128
    digits_pos = d_pos + (forward ? 0 : n_digits);
9809
128
    if (writer) {
9810
64
        buffer_pos = writer->pos + (forward ? 0 : n_buffer);
9811
64
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9812
64
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9813
64
    }
9814
64
    else {
9815
64
        buffer_pos = forward ? 0 : n_buffer;
9816
64
    }
9817
9818
128
    if (!writer) {
9819
64
        *maxchar = 127;
9820
64
    }
9821
9822
128
    while ((len = GroupGenerator_next(&groupgen)) > 0) {
9823
0
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9824
0
        n_zeros = Py_MAX(0, len - remaining);
9825
0
        n_chars = Py_MAX(0, Py_MIN(remaining, len));
9826
9827
        /* Use n_zero zero's and n_chars chars */
9828
9829
        /* Count only, don't do anything. */
9830
0
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9831
9832
        /* Copy into the writer. */
9833
0
        InsertThousandsGrouping_fill(writer, &buffer_pos,
9834
0
                                     digits, &digits_pos,
9835
0
                                     n_chars, n_zeros,
9836
0
                                     use_separator ? thousands_sep : NULL,
9837
0
                                     thousands_sep_len, maxchar, forward);
9838
9839
        /* Use a separator next time. */
9840
0
        use_separator = 1;
9841
9842
0
        remaining -= n_chars;
9843
0
        min_width -= len;
9844
9845
0
        if (remaining <= 0 && min_width <= 0) {
9846
0
            loop_broken = 1;
9847
0
            break;
9848
0
        }
9849
0
        min_width -= thousands_sep_len;
9850
0
    }
9851
128
    if (!loop_broken) {
9852
        /* We left the loop without using a break statement. */
9853
9854
128
        len = Py_MAX(Py_MAX(remaining, min_width), 1);
9855
128
        n_zeros = Py_MAX(0, len - remaining);
9856
128
        n_chars = Py_MAX(0, Py_MIN(remaining, len));
9857
9858
        /* Use n_zero zero's and n_chars chars */
9859
128
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9860
9861
        /* Copy into the writer. */
9862
128
        InsertThousandsGrouping_fill(writer, &buffer_pos,
9863
128
                                     digits, &digits_pos,
9864
128
                                     n_chars, n_zeros,
9865
128
                                     use_separator ? thousands_sep : NULL,
9866
128
                                     thousands_sep_len, maxchar, forward);
9867
128
    }
9868
128
    return count;
9869
128
}
9870
9871
Py_ssize_t
9872
PyUnicode_Count(PyObject *str,
9873
                PyObject *substr,
9874
                Py_ssize_t start,
9875
                Py_ssize_t end)
9876
0
{
9877
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9878
0
        return -1;
9879
9880
0
    return unicode_count_impl(str, substr, start, end);
9881
0
}
9882
9883
Py_ssize_t
9884
PyUnicode_Find(PyObject *str,
9885
               PyObject *substr,
9886
               Py_ssize_t start,
9887
               Py_ssize_t end,
9888
               int direction)
9889
0
{
9890
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9891
0
        return -2;
9892
9893
0
    return any_find_slice(str, substr, start, end, direction);
9894
0
}
9895
9896
Py_ssize_t
9897
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9898
                   Py_ssize_t start, Py_ssize_t end,
9899
                   int direction)
9900
440k
{
9901
440k
    int kind;
9902
440k
    Py_ssize_t len, result;
9903
440k
    len = PyUnicode_GET_LENGTH(str);
9904
440k
    ADJUST_INDICES(start, end, len);
9905
440k
    if (end - start < 1)
9906
0
        return -1;
9907
440k
    kind = PyUnicode_KIND(str);
9908
440k
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9909
440k
                      kind, end-start, ch, direction);
9910
440k
    if (result == -1)
9911
47.9k
        return -1;
9912
392k
    else
9913
392k
        return start + result;
9914
440k
}
9915
9916
static int
9917
tailmatch(PyObject *self,
9918
          PyObject *substring,
9919
          Py_ssize_t start,
9920
          Py_ssize_t end,
9921
          int direction)
9922
92.3M
{
9923
92.3M
    int kind_self;
9924
92.3M
    int kind_sub;
9925
92.3M
    const void *data_self;
9926
92.3M
    const void *data_sub;
9927
92.3M
    Py_ssize_t offset;
9928
92.3M
    Py_ssize_t i;
9929
92.3M
    Py_ssize_t end_sub;
9930
9931
92.3M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9932
92.3M
    end -= PyUnicode_GET_LENGTH(substring);
9933
92.3M
    if (end < start)
9934
11.0M
        return 0;
9935
9936
81.3M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9937
0
        return 1;
9938
9939
81.3M
    kind_self = PyUnicode_KIND(self);
9940
81.3M
    data_self = PyUnicode_DATA(self);
9941
81.3M
    kind_sub = PyUnicode_KIND(substring);
9942
81.3M
    data_sub = PyUnicode_DATA(substring);
9943
81.3M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9944
9945
81.3M
    if (direction > 0)
9946
6.38M
        offset = end;
9947
74.9M
    else
9948
74.9M
        offset = start;
9949
9950
81.3M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9951
81.3M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9952
81.3M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9953
41.5M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9954
        /* If both are of the same kind, memcmp is sufficient */
9955
12.2M
        if (kind_self == kind_sub) {
9956
8.82M
            return ! memcmp((char *)data_self +
9957
8.82M
                                (offset * PyUnicode_KIND(substring)),
9958
8.82M
                            data_sub,
9959
8.82M
                            PyUnicode_GET_LENGTH(substring) *
9960
8.82M
                                PyUnicode_KIND(substring));
9961
8.82M
        }
9962
        /* otherwise we have to compare each character by first accessing it */
9963
3.39M
        else {
9964
            /* We do not need to compare 0 and len(substring)-1 because
9965
               the if statement above ensured already that they are equal
9966
               when we end up here. */
9967
3.48M
            for (i = 1; i < end_sub; ++i) {
9968
100k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9969
100k
                    PyUnicode_READ(kind_sub, data_sub, i))
9970
9.65k
                    return 0;
9971
100k
            }
9972
3.38M
            return 1;
9973
3.39M
        }
9974
12.2M
    }
9975
9976
69.1M
    return 0;
9977
81.3M
}
9978
9979
Py_ssize_t
9980
PyUnicode_Tailmatch(PyObject *str,
9981
                    PyObject *substr,
9982
                    Py_ssize_t start,
9983
                    Py_ssize_t end,
9984
                    int direction)
9985
0
{
9986
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9987
0
        return -1;
9988
9989
0
    return tailmatch(str, substr, start, end, direction);
9990
0
}
9991
9992
/* Return a new string of the same length as `self`, created with maximum
   character 127, filled by _Py_bytes_lower() when `lower` is non-zero and
   by _Py_bytes_upper() otherwise.  Returns NULL if the result cannot be
   allocated.
   NOTE(review): presumably callers only pass ASCII strings -- confirm. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const char *src = PyUnicode_DATA(self);
    PyObject *converted = PyUnicode_New(length, 127);
    char *dst;

    if (converted == NULL) {
        return NULL;
    }

    dst = PyUnicode_DATA(converted);
    if (lower) {
        _Py_bytes_lower(dst, src, length);
    }
    else {
        _Py_bytes_upper(dst, src, length);
    }
    return converted;
}
10010
10011
static Py_UCS4
10012
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
10013
34.9k
{
10014
34.9k
    Py_ssize_t j;
10015
34.9k
    int final_sigma;
10016
34.9k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
10017
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
10018
10019
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10020
10021
    where ! is a negation and \p{xxx} is a character with property xxx.
10022
    */
10023
82.1k
    for (j = i - 1; j >= 0; j--) {
10024
80.7k
        c = PyUnicode_READ(kind, data, j);
10025
80.7k
        if (!_PyUnicode_IsCaseIgnorable(c))
10026
33.5k
            break;
10027
80.7k
    }
10028
34.9k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10029
34.9k
    if (final_sigma) {
10030
61.9k
        for (j = i + 1; j < length; j++) {
10031
59.8k
            c = PyUnicode_READ(kind, data, j);
10032
59.8k
            if (!_PyUnicode_IsCaseIgnorable(c))
10033
22.5k
                break;
10034
59.8k
        }
10035
24.6k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
10036
24.6k
    }
10037
34.9k
    return (final_sigma) ? 0x3C2 : 0x3C3;
10038
34.9k
}
10039
10040
static int
10041
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
10042
           Py_UCS4 c, Py_UCS4 *mapped)
10043
83.1M
{
10044
    /* Obscure special case. */
10045
83.1M
    if (c == 0x3A3) {
10046
34.9k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
10047
34.9k
        return 1;
10048
34.9k
    }
10049
83.0M
    return _PyUnicode_ToLowerFull(c, mapped);
10050
83.1M
}
10051
10052
static Py_ssize_t
10053
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10054
0
{
10055
0
    Py_ssize_t i, k = 0;
10056
0
    int n_res, j;
10057
0
    Py_UCS4 c, mapped[3];
10058
10059
0
    c = PyUnicode_READ(kind, data, 0);
10060
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
10061
0
    for (j = 0; j < n_res; j++) {
10062
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
10063
0
        res[k++] = mapped[j];
10064
0
    }
10065
0
    for (i = 1; i < length; i++) {
10066
0
        c = PyUnicode_READ(kind, data, i);
10067
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
10068
0
        for (j = 0; j < n_res; j++) {
10069
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10070
0
            res[k++] = mapped[j];
10071
0
        }
10072
0
    }
10073
0
    return k;
10074
0
}
10075
10076
static Py_ssize_t
10077
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
10078
0
    Py_ssize_t i, k = 0;
10079
10080
0
    for (i = 0; i < length; i++) {
10081
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10082
0
        int n_res, j;
10083
0
        if (Py_UNICODE_ISUPPER(c)) {
10084
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
10085
0
        }
10086
0
        else if (Py_UNICODE_ISLOWER(c)) {
10087
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
10088
0
        }
10089
0
        else {
10090
0
            n_res = 1;
10091
0
            mapped[0] = c;
10092
0
        }
10093
0
        for (j = 0; j < n_res; j++) {
10094
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10095
0
            res[k++] = mapped[j];
10096
0
        }
10097
0
    }
10098
0
    return k;
10099
0
}
10100
10101
static Py_ssize_t
10102
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
10103
                  Py_UCS4 *maxchar, int lower)
10104
16.3M
{
10105
16.3M
    Py_ssize_t i, k = 0;
10106
10107
99.4M
    for (i = 0; i < length; i++) {
10108
83.1M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10109
83.1M
        int n_res, j;
10110
83.1M
        if (lower)
10111
83.1M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
10112
0
        else
10113
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
10114
166M
        for (j = 0; j < n_res; j++) {
10115
83.1M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10116
83.1M
            res[k++] = mapped[j];
10117
83.1M
        }
10118
83.1M
    }
10119
16.3M
    return k;
10120
16.3M
}
10121
10122
static Py_ssize_t
10123
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10124
0
{
10125
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10126
0
}
10127
10128
static Py_ssize_t
10129
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10130
16.3M
{
10131
16.3M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10132
16.3M
}
10133
10134
static Py_ssize_t
10135
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10136
0
{
10137
0
    Py_ssize_t i, k = 0;
10138
10139
0
    for (i = 0; i < length; i++) {
10140
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
10141
0
        Py_UCS4 mapped[3];
10142
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10143
0
        for (j = 0; j < n_res; j++) {
10144
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10145
0
            res[k++] = mapped[j];
10146
0
        }
10147
0
    }
10148
0
    return k;
10149
0
}
10150
10151
static Py_ssize_t
10152
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10153
0
{
10154
0
    Py_ssize_t i, k = 0;
10155
0
    int previous_is_cased;
10156
10157
0
    previous_is_cased = 0;
10158
0
    for (i = 0; i < length; i++) {
10159
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10160
0
        Py_UCS4 mapped[3];
10161
0
        int n_res, j;
10162
10163
0
        if (previous_is_cased)
10164
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
10165
0
        else
10166
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
10167
10168
0
        for (j = 0; j < n_res; j++) {
10169
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10170
0
            res[k++] = mapped[j];
10171
0
        }
10172
10173
0
        previous_is_cased = _PyUnicode_IsCased(c);
10174
0
    }
10175
0
    return k;
10176
0
}
10177
10178
static PyObject *
10179
case_operation(PyObject *self,
10180
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10181
16.3M
{
10182
16.3M
    PyObject *res = NULL;
10183
16.3M
    Py_ssize_t length, newlength = 0;
10184
16.3M
    int kind, outkind;
10185
16.3M
    const void *data;
10186
16.3M
    void *outdata;
10187
16.3M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
10188
10189
16.3M
    kind = PyUnicode_KIND(self);
10190
16.3M
    data = PyUnicode_DATA(self);
10191
16.3M
    length = PyUnicode_GET_LENGTH(self);
10192
16.3M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10193
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
10194
0
        return NULL;
10195
0
    }
10196
16.3M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10197
16.3M
    if (tmp == NULL)
10198
0
        return PyErr_NoMemory();
10199
16.3M
    newlength = perform(kind, data, length, tmp, &maxchar);
10200
16.3M
    res = PyUnicode_New(newlength, maxchar);
10201
16.3M
    if (res == NULL)
10202
0
        goto leave;
10203
16.3M
    tmpend = tmp + newlength;
10204
16.3M
    outdata = PyUnicode_DATA(res);
10205
16.3M
    outkind = PyUnicode_KIND(res);
10206
16.3M
    switch (outkind) {
10207
187k
    case PyUnicode_1BYTE_KIND:
10208
187k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10209
187k
        break;
10210
16.1M
    case PyUnicode_2BYTE_KIND:
10211
16.1M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10212
16.1M
        break;
10213
47.1k
    case PyUnicode_4BYTE_KIND:
10214
47.1k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10215
47.1k
        break;
10216
0
    default:
10217
0
        Py_UNREACHABLE();
10218
16.3M
    }
10219
16.3M
  leave:
10220
16.3M
    PyMem_Free(tmp);
10221
16.3M
    return res;
10222
16.3M
}
10223
10224
/* Join the items of the iterable `seq` with `separator`.

   `seq` is first converted with PySequence_Fast(); NULL is returned if
   that fails.  The actual work is delegated to _PyUnicode_JoinArray(),
   called between the Py_BEGIN/END_CRITICAL_SECTION_SEQUENCE_FAST macros,
   and its result is returned as-is. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");

    if (fast == NULL) {
        return NULL;
    }

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    joined = _PyUnicode_JoinArray(separator,
                                  PySequence_Fast_ITEMS(fast),
                                  PySequence_Fast_GET_SIZE(fast));

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
10248
10249
PyObject *
10250
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10251
43.2M
{
10252
43.2M
    PyObject *res = NULL; /* the result */
10253
43.2M
    PyObject *sep = NULL;
10254
43.2M
    Py_ssize_t seplen;
10255
43.2M
    PyObject *item;
10256
43.2M
    Py_ssize_t sz, i, res_offset;
10257
43.2M
    Py_UCS4 maxchar;
10258
43.2M
    Py_UCS4 item_maxchar;
10259
43.2M
    int use_memcpy;
10260
43.2M
    unsigned char *res_data = NULL, *sep_data = NULL;
10261
43.2M
    PyObject *last_obj;
10262
43.2M
    int kind = 0;
10263
10264
    /* If empty sequence, return u"". */
10265
43.2M
    if (seqlen == 0) {
10266
4.87M
        _Py_RETURN_UNICODE_EMPTY();
10267
4.87M
    }
10268
10269
    /* If singleton sequence with an exact Unicode, return that. */
10270
38.3M
    last_obj = NULL;
10271
38.3M
    if (seqlen == 1) {
10272
6.10M
        if (PyUnicode_CheckExact(items[0])) {
10273
4.64M
            res = items[0];
10274
4.64M
            return Py_NewRef(res);
10275
4.64M
        }
10276
1.46M
        seplen = 0;
10277
1.46M
        maxchar = 0;
10278
1.46M
    }
10279
32.2M
    else {
10280
        /* Set up sep and seplen */
10281
32.2M
        if (separator == NULL) {
10282
            /* fall back to a blank space separator */
10283
0
            sep = PyUnicode_FromOrdinal(' ');
10284
0
            if (!sep)
10285
0
                goto onError;
10286
0
            seplen = 1;
10287
0
            maxchar = 32;
10288
0
        }
10289
32.2M
        else {
10290
32.2M
            if (!PyUnicode_Check(separator)) {
10291
0
                PyErr_Format(PyExc_TypeError,
10292
0
                             "separator: expected str instance,"
10293
0
                             " %.80s found",
10294
0
                             Py_TYPE(separator)->tp_name);
10295
0
                goto onError;
10296
0
            }
10297
32.2M
            sep = separator;
10298
32.2M
            seplen = PyUnicode_GET_LENGTH(separator);
10299
32.2M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10300
            /* inc refcount to keep this code path symmetric with the
10301
               above case of a blank separator */
10302
32.2M
            Py_INCREF(sep);
10303
32.2M
        }
10304
32.2M
        last_obj = sep;
10305
32.2M
    }
10306
10307
    /* There are at least two things to join, or else we have a subclass
10308
     * of str in the sequence.
10309
     * Do a pre-pass to figure out the total amount of space we'll
10310
     * need (sz), and see whether all argument are strings.
10311
     */
10312
33.7M
    sz = 0;
10313
#ifdef Py_DEBUG
10314
    use_memcpy = 0;
10315
#else
10316
33.7M
    use_memcpy = 1;
10317
33.7M
#endif
10318
329M
    for (i = 0; i < seqlen; i++) {
10319
296M
        size_t add_sz;
10320
296M
        item = items[i];
10321
296M
        if (!PyUnicode_Check(item)) {
10322
0
            PyErr_Format(PyExc_TypeError,
10323
0
                         "sequence item %zd: expected str instance,"
10324
0
                         " %.80s found",
10325
0
                         i, Py_TYPE(item)->tp_name);
10326
0
            goto onError;
10327
0
        }
10328
296M
        add_sz = PyUnicode_GET_LENGTH(item);
10329
296M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10330
296M
        maxchar = Py_MAX(maxchar, item_maxchar);
10331
296M
        if (i != 0) {
10332
262M
            add_sz += seplen;
10333
262M
        }
10334
296M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10335
0
            PyErr_SetString(PyExc_OverflowError,
10336
0
                            "join() result is too long for a Python string");
10337
0
            goto onError;
10338
0
        }
10339
296M
        sz += add_sz;
10340
296M
        if (use_memcpy && last_obj != NULL) {
10341
231M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10342
3.60M
                use_memcpy = 0;
10343
231M
        }
10344
296M
        last_obj = item;
10345
296M
    }
10346
10347
33.7M
    res = PyUnicode_New(sz, maxchar);
10348
33.7M
    if (res == NULL)
10349
0
        goto onError;
10350
10351
    /* Catenate everything. */
10352
#ifdef Py_DEBUG
10353
    use_memcpy = 0;
10354
#else
10355
33.7M
    if (use_memcpy) {
10356
30.1M
        res_data = PyUnicode_1BYTE_DATA(res);
10357
30.1M
        kind = PyUnicode_KIND(res);
10358
30.1M
        if (seplen != 0)
10359
14.7k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10360
30.1M
    }
10361
33.7M
#endif
10362
33.7M
    if (use_memcpy) {
10363
236M
        for (i = 0; i < seqlen; ++i) {
10364
206M
            Py_ssize_t itemlen;
10365
206M
            item = items[i];
10366
10367
            /* Copy item, and maybe the separator. */
10368
206M
            if (i && seplen != 0) {
10369
19.4k
                memcpy(res_data,
10370
19.4k
                          sep_data,
10371
19.4k
                          kind * seplen);
10372
19.4k
                res_data += kind * seplen;
10373
19.4k
            }
10374
10375
206M
            itemlen = PyUnicode_GET_LENGTH(item);
10376
206M
            if (itemlen != 0) {
10377
184M
                memcpy(res_data,
10378
184M
                          PyUnicode_DATA(item),
10379
184M
                          kind * itemlen);
10380
184M
                res_data += kind * itemlen;
10381
184M
            }
10382
206M
        }
10383
30.1M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10384
30.1M
                           + kind * PyUnicode_GET_LENGTH(res));
10385
30.1M
    }
10386
3.60M
    else {
10387
93.6M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10388
90.0M
            Py_ssize_t itemlen;
10389
90.0M
            item = items[i];
10390
10391
            /* Copy item, and maybe the separator. */
10392
90.0M
            if (i && seplen != 0) {
10393
59.1k
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10394
59.1k
                res_offset += seplen;
10395
59.1k
            }
10396
10397
90.0M
            itemlen = PyUnicode_GET_LENGTH(item);
10398
90.0M
            if (itemlen != 0) {
10399
89.6M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10400
89.6M
                res_offset += itemlen;
10401
89.6M
            }
10402
90.0M
        }
10403
3.60M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10404
3.60M
    }
10405
10406
33.7M
    Py_XDECREF(sep);
10407
33.7M
    assert(_PyUnicode_CheckConsistency(res, 1));
10408
33.7M
    return res;
10409
10410
0
  onError:
10411
0
    Py_XDECREF(sep);
10412
0
    Py_XDECREF(res);
10413
0
    return NULL;
10414
33.7M
}
10415
10416
void
10417
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10418
                    Py_UCS4 fill_char)
10419
672
{
10420
672
    const int kind = PyUnicode_KIND(unicode);
10421
672
    void *data = PyUnicode_DATA(unicode);
10422
672
    assert(unicode_modifiable(unicode));
10423
672
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10424
672
    assert(start >= 0);
10425
672
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10426
672
    unicode_fill(kind, data, fill_char, start, length);
10427
672
}
10428
10429
Py_ssize_t
10430
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10431
               Py_UCS4 fill_char)
10432
672
{
10433
672
    Py_ssize_t maxlen;
10434
10435
672
    if (!PyUnicode_Check(unicode)) {
10436
0
        PyErr_BadInternalCall();
10437
0
        return -1;
10438
0
    }
10439
672
    if (unicode_check_modifiable(unicode))
10440
0
        return -1;
10441
10442
672
    if (start < 0) {
10443
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10444
0
        return -1;
10445
0
    }
10446
672
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10447
0
        PyErr_SetString(PyExc_ValueError,
10448
0
                         "fill character is bigger than "
10449
0
                         "the string maximum character");
10450
0
        return -1;
10451
0
    }
10452
10453
672
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10454
672
    length = Py_MIN(maxlen, length);
10455
672
    if (length <= 0)
10456
0
        return 0;
10457
10458
672
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10459
672
    return length;
10460
672
}
10461
10462
static PyObject *
10463
pad(PyObject *self,
10464
    Py_ssize_t left,
10465
    Py_ssize_t right,
10466
    Py_UCS4 fill)
10467
0
{
10468
0
    PyObject *u;
10469
0
    Py_UCS4 maxchar;
10470
0
    int kind;
10471
0
    void *data;
10472
10473
0
    if (left < 0)
10474
0
        left = 0;
10475
0
    if (right < 0)
10476
0
        right = 0;
10477
10478
0
    if (left == 0 && right == 0)
10479
0
        return unicode_result_unchanged(self);
10480
10481
0
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10482
0
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10483
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10484
0
        return NULL;
10485
0
    }
10486
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10487
0
    maxchar = Py_MAX(maxchar, fill);
10488
0
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10489
0
    if (!u)
10490
0
        return NULL;
10491
10492
0
    kind = PyUnicode_KIND(u);
10493
0
    data = PyUnicode_DATA(u);
10494
0
    if (left)
10495
0
        unicode_fill(kind, data, fill, 0, left);
10496
0
    if (right)
10497
0
        unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10498
0
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10499
0
    assert(_PyUnicode_CheckConsistency(u, 1));
10500
0
    return u;
10501
0
}
10502
10503
/* Dispatch to the splitlines implementation that matches the storage kind of
   `string` (ASCII, UCS1, UCS2 or UCS4) and return whatever it returns.
   `keepends` is forwarded unchanged.

   Returns NULL when ensure_unicode() rejects `string`. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    Py_ssize_t length;

    if (ensure_unicode(string) < 0) {
        return NULL;
    }
    length = PyUnicode_GET_LENGTH(string);

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        /* One-byte strings have a dedicated ASCII-only variant. */
        if (PyUnicode_IS_ASCII(string)) {
            return asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        return ucs1lib_splitlines(
            string, PyUnicode_1BYTE_DATA(string), length, keepends);

    case PyUnicode_2BYTE_KIND:
        return ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), length, keepends);

    case PyUnicode_4BYTE_KIND:
        return ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), length, keepends);

    default:
        Py_UNREACHABLE();
    }
}
10537
10538
static PyObject *
10539
split(PyObject *self,
10540
      PyObject *substring,
10541
      Py_ssize_t maxcount)
10542
19.8M
{
10543
19.8M
    int kind1, kind2;
10544
19.8M
    const void *buf1, *buf2;
10545
19.8M
    Py_ssize_t len1, len2;
10546
19.8M
    PyObject* out;
10547
19.8M
    len1 = PyUnicode_GET_LENGTH(self);
10548
19.8M
    kind1 = PyUnicode_KIND(self);
10549
10550
19.8M
    if (substring == NULL) {
10551
160k
        if (maxcount < 0) {
10552
137k
            maxcount = (len1 - 1) / 2 + 1;
10553
137k
        }
10554
160k
        switch (kind1) {
10555
109k
        case PyUnicode_1BYTE_KIND:
10556
109k
            if (PyUnicode_IS_ASCII(self))
10557
80.4k
                return asciilib_split_whitespace(
10558
80.4k
                    self,  PyUnicode_1BYTE_DATA(self),
10559
80.4k
                    len1, maxcount
10560
80.4k
                    );
10561
28.6k
            else
10562
28.6k
                return ucs1lib_split_whitespace(
10563
28.6k
                    self,  PyUnicode_1BYTE_DATA(self),
10564
28.6k
                    len1, maxcount
10565
28.6k
                    );
10566
40.3k
        case PyUnicode_2BYTE_KIND:
10567
40.3k
            return ucs2lib_split_whitespace(
10568
40.3k
                self,  PyUnicode_2BYTE_DATA(self),
10569
40.3k
                len1, maxcount
10570
40.3k
                );
10571
11.3k
        case PyUnicode_4BYTE_KIND:
10572
11.3k
            return ucs4lib_split_whitespace(
10573
11.3k
                self,  PyUnicode_4BYTE_DATA(self),
10574
11.3k
                len1, maxcount
10575
11.3k
                );
10576
0
        default:
10577
0
            Py_UNREACHABLE();
10578
160k
        }
10579
160k
    }
10580
10581
19.6M
    kind2 = PyUnicode_KIND(substring);
10582
19.6M
    len2 = PyUnicode_GET_LENGTH(substring);
10583
19.6M
    if (maxcount < 0) {
10584
        // if len2 == 0, it will raise ValueError.
10585
10.9M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10586
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10587
10.9M
        maxcount = maxcount < 0 ? len1 : maxcount;
10588
10.9M
    }
10589
19.6M
    if (kind1 < kind2 || len1 < len2) {
10590
4.92M
        out = PyList_New(1);
10591
4.92M
        if (out == NULL)
10592
0
            return NULL;
10593
4.92M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10594
4.92M
        return out;
10595
4.92M
    }
10596
14.7M
    buf1 = PyUnicode_DATA(self);
10597
14.7M
    buf2 = PyUnicode_DATA(substring);
10598
14.7M
    if (kind2 != kind1) {
10599
191k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10600
191k
        if (!buf2)
10601
0
            return NULL;
10602
191k
    }
10603
10604
14.7M
    switch (kind1) {
10605
14.5M
    case PyUnicode_1BYTE_KIND:
10606
14.5M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10607
13.7M
            out = asciilib_split(
10608
13.7M
                self,  buf1, len1, buf2, len2, maxcount);
10609
786k
        else
10610
786k
            out = ucs1lib_split(
10611
786k
                self,  buf1, len1, buf2, len2, maxcount);
10612
14.5M
        break;
10613
156k
    case PyUnicode_2BYTE_KIND:
10614
156k
        out = ucs2lib_split(
10615
156k
            self,  buf1, len1, buf2, len2, maxcount);
10616
156k
        break;
10617
34.6k
    case PyUnicode_4BYTE_KIND:
10618
34.6k
        out = ucs4lib_split(
10619
34.6k
            self,  buf1, len1, buf2, len2, maxcount);
10620
34.6k
        break;
10621
0
    default:
10622
0
        out = NULL;
10623
14.7M
    }
10624
14.7M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10625
14.7M
    if (kind2 != kind1)
10626
191k
        PyMem_Free((void *)buf2);
10627
14.7M
    return out;
10628
14.7M
}
10629
10630
static PyObject *
10631
rsplit(PyObject *self,
10632
       PyObject *substring,
10633
       Py_ssize_t maxcount)
10634
50
{
10635
50
    int kind1, kind2;
10636
50
    const void *buf1, *buf2;
10637
50
    Py_ssize_t len1, len2;
10638
50
    PyObject* out;
10639
10640
50
    len1 = PyUnicode_GET_LENGTH(self);
10641
50
    kind1 = PyUnicode_KIND(self);
10642
10643
50
    if (substring == NULL) {
10644
0
        if (maxcount < 0) {
10645
0
            maxcount = (len1 - 1) / 2 + 1;
10646
0
        }
10647
0
        switch (kind1) {
10648
0
        case PyUnicode_1BYTE_KIND:
10649
0
            if (PyUnicode_IS_ASCII(self))
10650
0
                return asciilib_rsplit_whitespace(
10651
0
                    self,  PyUnicode_1BYTE_DATA(self),
10652
0
                    len1, maxcount
10653
0
                    );
10654
0
            else
10655
0
                return ucs1lib_rsplit_whitespace(
10656
0
                    self,  PyUnicode_1BYTE_DATA(self),
10657
0
                    len1, maxcount
10658
0
                    );
10659
0
        case PyUnicode_2BYTE_KIND:
10660
0
            return ucs2lib_rsplit_whitespace(
10661
0
                self,  PyUnicode_2BYTE_DATA(self),
10662
0
                len1, maxcount
10663
0
                );
10664
0
        case PyUnicode_4BYTE_KIND:
10665
0
            return ucs4lib_rsplit_whitespace(
10666
0
                self,  PyUnicode_4BYTE_DATA(self),
10667
0
                len1, maxcount
10668
0
                );
10669
0
        default:
10670
0
            Py_UNREACHABLE();
10671
0
        }
10672
0
    }
10673
50
    kind2 = PyUnicode_KIND(substring);
10674
50
    len2 = PyUnicode_GET_LENGTH(substring);
10675
50
    if (maxcount < 0) {
10676
        // if len2 == 0, it will raise ValueError.
10677
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10678
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10679
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10680
0
    }
10681
50
    if (kind1 < kind2 || len1 < len2) {
10682
0
        out = PyList_New(1);
10683
0
        if (out == NULL)
10684
0
            return NULL;
10685
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10686
0
        return out;
10687
0
    }
10688
50
    buf1 = PyUnicode_DATA(self);
10689
50
    buf2 = PyUnicode_DATA(substring);
10690
50
    if (kind2 != kind1) {
10691
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10692
0
        if (!buf2)
10693
0
            return NULL;
10694
0
    }
10695
10696
50
    switch (kind1) {
10697
50
    case PyUnicode_1BYTE_KIND:
10698
50
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10699
50
            out = asciilib_rsplit(
10700
50
                self,  buf1, len1, buf2, len2, maxcount);
10701
0
        else
10702
0
            out = ucs1lib_rsplit(
10703
0
                self,  buf1, len1, buf2, len2, maxcount);
10704
50
        break;
10705
0
    case PyUnicode_2BYTE_KIND:
10706
0
        out = ucs2lib_rsplit(
10707
0
            self,  buf1, len1, buf2, len2, maxcount);
10708
0
        break;
10709
0
    case PyUnicode_4BYTE_KIND:
10710
0
        out = ucs4lib_rsplit(
10711
0
            self,  buf1, len1, buf2, len2, maxcount);
10712
0
        break;
10713
0
    default:
10714
0
        out = NULL;
10715
50
    }
10716
50
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10717
50
    if (kind2 != kind1)
10718
0
        PyMem_Free((void *)buf2);
10719
50
    return out;
10720
50
}
10721
10722
static Py_ssize_t
10723
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10724
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10725
144M
{
10726
144M
    switch (kind) {
10727
18.8M
    case PyUnicode_1BYTE_KIND:
10728
18.8M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10729
16.1M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10730
2.63M
        else
10731
2.63M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10732
50.5M
    case PyUnicode_2BYTE_KIND:
10733
50.5M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10734
75.2M
    case PyUnicode_4BYTE_KIND:
10735
75.2M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10736
144M
    }
10737
144M
    Py_UNREACHABLE();
10738
144M
}
10739
10740
static Py_ssize_t
10741
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10742
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10743
42.1M
{
10744
42.1M
    switch (kind) {
10745
37.7M
    case PyUnicode_1BYTE_KIND:
10746
37.7M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10747
4.25M
    case PyUnicode_2BYTE_KIND:
10748
4.25M
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10749
128k
    case PyUnicode_4BYTE_KIND:
10750
128k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10751
42.1M
    }
10752
42.1M
    Py_UNREACHABLE();
10753
42.1M
}
10754
10755
static void
10756
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10757
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10758
1.05M
{
10759
1.05M
    int kind = PyUnicode_KIND(u);
10760
1.05M
    void *data = PyUnicode_DATA(u);
10761
1.05M
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10762
1.05M
    if (kind == PyUnicode_1BYTE_KIND) {
10763
372k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10764
372k
                                      (Py_UCS1 *)data + len,
10765
372k
                                      u1, u2, maxcount);
10766
372k
    }
10767
685k
    else if (kind == PyUnicode_2BYTE_KIND) {
10768
672k
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10769
672k
                                      (Py_UCS2 *)data + len,
10770
672k
                                      u1, u2, maxcount);
10771
672k
    }
10772
12.5k
    else {
10773
12.5k
        assert(kind == PyUnicode_4BYTE_KIND);
10774
12.5k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10775
12.5k
                                      (Py_UCS4 *)data + len,
10776
12.5k
                                      u1, u2, maxcount);
10777
12.5k
    }
10778
1.05M
}
10779
10780
static PyObject *
10781
replace(PyObject *self, PyObject *str1,
10782
        PyObject *str2, Py_ssize_t maxcount)
10783
77.0M
{
10784
77.0M
    PyObject *u;
10785
77.0M
    const char *sbuf = PyUnicode_DATA(self);
10786
77.0M
    const void *buf1 = PyUnicode_DATA(str1);
10787
77.0M
    const void *buf2 = PyUnicode_DATA(str2);
10788
77.0M
    int srelease = 0, release1 = 0, release2 = 0;
10789
77.0M
    int skind = PyUnicode_KIND(self);
10790
77.0M
    int kind1 = PyUnicode_KIND(str1);
10791
77.0M
    int kind2 = PyUnicode_KIND(str2);
10792
77.0M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10793
77.0M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10794
77.0M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10795
77.0M
    int mayshrink;
10796
77.0M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10797
10798
77.0M
    if (slen < len1)
10799
30.1M
        goto nothing;
10800
10801
46.9M
    if (maxcount < 0)
10802
46.9M
        maxcount = PY_SSIZE_T_MAX;
10803
0
    else if (maxcount == 0)
10804
0
        goto nothing;
10805
10806
46.9M
    if (str1 == str2)
10807
0
        goto nothing;
10808
10809
46.9M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10810
46.9M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10811
46.9M
    if (maxchar < maxchar_str1)
10812
        /* substring too wide to be present */
10813
0
        goto nothing;
10814
46.9M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10815
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10816
       result string. */
10817
46.9M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10818
46.9M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10819
10820
46.9M
    if (len1 == len2) {
10821
        /* same length */
10822
4.82M
        if (len1 == 0)
10823
0
            goto nothing;
10824
4.82M
        if (len1 == 1) {
10825
            /* replace characters */
10826
4.82M
            Py_UCS4 u1, u2;
10827
4.82M
            Py_ssize_t pos;
10828
10829
4.82M
            u1 = PyUnicode_READ(kind1, buf1, 0);
10830
4.82M
            pos = findchar(sbuf, skind, slen, u1, 1);
10831
4.82M
            if (pos < 0)
10832
3.76M
                goto nothing;
10833
1.05M
            u2 = PyUnicode_READ(kind2, buf2, 0);
10834
1.05M
            u = PyUnicode_New(slen, maxchar);
10835
1.05M
            if (!u)
10836
0
                goto error;
10837
10838
1.05M
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10839
1.05M
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10840
1.05M
        }
10841
0
        else {
10842
0
            int rkind = skind;
10843
0
            char *res;
10844
0
            Py_ssize_t i;
10845
10846
0
            if (kind1 < rkind) {
10847
                /* widen substring */
10848
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10849
0
                if (!buf1) goto error;
10850
0
                release1 = 1;
10851
0
            }
10852
0
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10853
0
            if (i < 0)
10854
0
                goto nothing;
10855
0
            if (rkind > kind2) {
10856
                /* widen replacement */
10857
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10858
0
                if (!buf2) goto error;
10859
0
                release2 = 1;
10860
0
            }
10861
0
            else if (rkind < kind2) {
10862
                /* widen self and buf1 */
10863
0
                rkind = kind2;
10864
0
                if (release1) {
10865
0
                    assert(buf1 != PyUnicode_DATA(str1));
10866
0
                    PyMem_Free((void *)buf1);
10867
0
                    buf1 = PyUnicode_DATA(str1);
10868
0
                    release1 = 0;
10869
0
                }
10870
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10871
0
                if (!sbuf) goto error;
10872
0
                srelease = 1;
10873
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10874
0
                if (!buf1) goto error;
10875
0
                release1 = 1;
10876
0
            }
10877
0
            u = PyUnicode_New(slen, maxchar);
10878
0
            if (!u)
10879
0
                goto error;
10880
0
            assert(PyUnicode_KIND(u) == rkind);
10881
0
            res = PyUnicode_DATA(u);
10882
10883
0
            memcpy(res, sbuf, rkind * slen);
10884
            /* change everything in-place, starting with this one */
10885
0
            memcpy(res + rkind * i,
10886
0
                   buf2,
10887
0
                   rkind * len2);
10888
0
            i += len1;
10889
10890
0
            while ( --maxcount > 0) {
10891
0
                i = anylib_find(rkind, self,
10892
0
                                sbuf+rkind*i, slen-i,
10893
0
                                str1, buf1, len1, i);
10894
0
                if (i == -1)
10895
0
                    break;
10896
0
                memcpy(res + rkind * i,
10897
0
                       buf2,
10898
0
                       rkind * len2);
10899
0
                i += len1;
10900
0
            }
10901
0
        }
10902
4.82M
    }
10903
42.1M
    else {
10904
42.1M
        Py_ssize_t n, i, j, ires;
10905
42.1M
        Py_ssize_t new_size;
10906
42.1M
        int rkind = skind;
10907
42.1M
        char *res;
10908
10909
42.1M
        if (kind1 < rkind) {
10910
            /* widen substring */
10911
4.38M
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10912
4.38M
            if (!buf1) goto error;
10913
4.38M
            release1 = 1;
10914
4.38M
        }
10915
42.1M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10916
42.1M
        if (n == 0)
10917
36.9M
            goto nothing;
10918
5.18M
        if (kind2 < rkind) {
10919
            /* widen replacement */
10920
926k
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10921
926k
            if (!buf2) goto error;
10922
926k
            release2 = 1;
10923
926k
        }
10924
4.25M
        else if (kind2 > rkind) {
10925
            /* widen self and buf1 */
10926
0
            rkind = kind2;
10927
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10928
0
            if (!sbuf) goto error;
10929
0
            srelease = 1;
10930
0
            if (release1) {
10931
0
                assert(buf1 != PyUnicode_DATA(str1));
10932
0
                PyMem_Free((void *)buf1);
10933
0
                buf1 = PyUnicode_DATA(str1);
10934
0
                release1 = 0;
10935
0
            }
10936
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10937
0
            if (!buf1) goto error;
10938
0
            release1 = 1;
10939
0
        }
10940
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10941
           PyUnicode_GET_LENGTH(str1)); */
10942
5.18M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10943
0
                PyErr_SetString(PyExc_OverflowError,
10944
0
                                "replace string is too long");
10945
0
                goto error;
10946
0
        }
10947
5.18M
        new_size = slen + n * (len2 - len1);
10948
5.18M
        if (new_size == 0) {
10949
0
            u = unicode_get_empty();
10950
0
            goto done;
10951
0
        }
10952
5.18M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10953
0
            PyErr_SetString(PyExc_OverflowError,
10954
0
                            "replace string is too long");
10955
0
            goto error;
10956
0
        }
10957
5.18M
        u = PyUnicode_New(new_size, maxchar);
10958
5.18M
        if (!u)
10959
0
            goto error;
10960
5.18M
        assert(PyUnicode_KIND(u) == rkind);
10961
5.18M
        res = PyUnicode_DATA(u);
10962
5.18M
        ires = i = 0;
10963
5.18M
        if (len1 > 0) {
10964
149M
            while (n-- > 0) {
10965
                /* look for next match */
10966
144M
                j = anylib_find(rkind, self,
10967
144M
                                sbuf + rkind * i, slen-i,
10968
144M
                                str1, buf1, len1, i);
10969
144M
                if (j == -1)
10970
0
                    break;
10971
144M
                else if (j > i) {
10972
                    /* copy unchanged part [i:j] */
10973
20.8M
                    memcpy(res + rkind * ires,
10974
20.8M
                           sbuf + rkind * i,
10975
20.8M
                           rkind * (j-i));
10976
20.8M
                    ires += j - i;
10977
20.8M
                }
10978
                /* copy substitution string */
10979
144M
                if (len2 > 0) {
10980
144M
                    memcpy(res + rkind * ires,
10981
144M
                           buf2,
10982
144M
                           rkind * len2);
10983
144M
                    ires += len2;
10984
144M
                }
10985
144M
                i = j + len1;
10986
144M
            }
10987
5.18M
            if (i < slen)
10988
                /* copy tail [i:] */
10989
5.11M
                memcpy(res + rkind * ires,
10990
5.11M
                       sbuf + rkind * i,
10991
5.11M
                       rkind * (slen-i));
10992
5.18M
        }
10993
0
        else {
10994
            /* interleave */
10995
0
            while (n > 0) {
10996
0
                memcpy(res + rkind * ires,
10997
0
                       buf2,
10998
0
                       rkind * len2);
10999
0
                ires += len2;
11000
0
                if (--n <= 0)
11001
0
                    break;
11002
0
                memcpy(res + rkind * ires,
11003
0
                       sbuf + rkind * i,
11004
0
                       rkind);
11005
0
                ires++;
11006
0
                i++;
11007
0
            }
11008
0
            memcpy(res + rkind * ires,
11009
0
                   sbuf + rkind * i,
11010
0
                   rkind * (slen-i));
11011
0
        }
11012
5.18M
    }
11013
11014
6.24M
    if (mayshrink) {
11015
0
        unicode_adjust_maxchar(&u);
11016
0
        if (u == NULL)
11017
0
            goto error;
11018
0
    }
11019
11020
6.24M
  done:
11021
6.24M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
11022
6.24M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11023
6.24M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11024
6.24M
    if (srelease)
11025
0
        PyMem_Free((void *)sbuf);
11026
6.24M
    if (release1)
11027
926k
        PyMem_Free((void *)buf1);
11028
6.24M
    if (release2)
11029
926k
        PyMem_Free((void *)buf2);
11030
6.24M
    assert(_PyUnicode_CheckConsistency(u, 1));
11031
6.24M
    return u;
11032
11033
70.8M
  nothing:
11034
    /* nothing to replace; return original string (when possible) */
11035
70.8M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
11036
70.8M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11037
70.8M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11038
70.8M
    if (srelease)
11039
0
        PyMem_Free((void *)sbuf);
11040
70.8M
    if (release1)
11041
3.45M
        PyMem_Free((void *)buf1);
11042
70.8M
    if (release2)
11043
0
        PyMem_Free((void *)buf2);
11044
70.8M
    return unicode_result_unchanged(self);
11045
11046
0
  error:
11047
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
11048
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11049
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11050
0
    if (srelease)
11051
0
        PyMem_Free((void *)sbuf);
11052
0
    if (release1)
11053
0
        PyMem_Free((void *)buf1);
11054
0
    if (release2)
11055
0
        PyMem_Free((void *)buf2);
11056
0
    return NULL;
11057
6.24M
}
11058
11059
/* --- Unicode Object Methods --------------------------------------------- */
11060
11061
/*[clinic input]
11062
str.title as unicode_title
11063
11064
Return a version of the string where each word is titlecased.
11065
11066
More specifically, words start with uppercased characters and all remaining
11067
cased characters have lower case.
11068
[clinic start generated code]*/
11069
11070
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
{
    /* All of the work is done by case_operation() with the do_title
       callback; its result (including NULL) is returned as-is. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
11076
11077
/*[clinic input]
11078
str.capitalize as unicode_capitalize
11079
11080
Return a capitalized version of the string.
11081
11082
More specifically, make the first character have upper case and the rest lower
11083
case.
11084
[clinic start generated code]*/
11085
11086
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
{
    /* Only a non-empty string goes through case_operation(); an empty one
       is handed to unicode_result_unchanged() instead. */
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
11094
11095
/*[clinic input]
11096
str.casefold as unicode_casefold
11097
11098
Return a version of the string suitable for caseless comparisons.
11099
[clinic start generated code]*/
11100
11101
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* Non-ASCII strings take the general case_operation() path with the
       do_casefold callback; ASCII strings use ascii_upper_or_lower() with
       its second argument set to 1. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_casefold);
    }
    return ascii_upper_or_lower(self, 1);
}
11109
11110
11111
/* Argument converter. Accepts a single Unicode character. */
11112
11113
static int
11114
convert_uc(PyObject *obj, void *addr)
11115
0
{
11116
0
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
11117
11118
0
    if (!PyUnicode_Check(obj)) {
11119
0
        PyErr_Format(PyExc_TypeError,
11120
0
                     "The fill character must be a unicode character, "
11121
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
11122
0
        return 0;
11123
0
    }
11124
0
    if (PyUnicode_GET_LENGTH(obj) != 1) {
11125
0
        PyErr_SetString(PyExc_TypeError,
11126
0
                        "The fill character must be exactly one character long");
11127
0
        return 0;
11128
0
    }
11129
0
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
11130
0
    return 1;
11131
0
}
11132
11133
/*[clinic input]
11134
str.center as unicode_center
11135
11136
    width: Py_ssize_t
11137
    fillchar: Py_UCS4 = ' '
11138
    /
11139
11140
Return a centered string of length width.
11141
11142
Padding is done using the specified fill character (default is a space).
11143
[clinic start generated code]*/
11144
11145
static PyObject *
11146
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11147
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
11148
0
{
11149
0
    Py_ssize_t marg, left;
11150
11151
0
    if (PyUnicode_GET_LENGTH(self) >= width)
11152
0
        return unicode_result_unchanged(self);
11153
11154
0
    marg = width - PyUnicode_GET_LENGTH(self);
11155
0
    left = marg / 2 + (marg & width & 1);
11156
11157
0
    return pad(self, left, marg - left, fillchar);
11158
0
}
11159
11160
/* This function assumes that str1 and str2 are readied by the caller. */
11161
11162
static int
11163
unicode_compare(PyObject *str1, PyObject *str2)
11164
16.2M
{
11165
16.2M
#define COMPARE(TYPE1, TYPE2) \
11166
16.2M
    do { \
11167
15.4M
        TYPE1* p1 = (TYPE1 *)data1; \
11168
15.4M
        TYPE2* p2 = (TYPE2 *)data2; \
11169
15.4M
        TYPE1* end = p1 + len; \
11170
15.4M
        Py_UCS4 c1, c2; \
11171
15.4M
        for (; p1 != end; p1++, p2++) { \
11172
15.4M
            c1 = *p1; \
11173
15.4M
            c2 = *p2; \
11174
15.4M
            if (c1 != c2) \
11175
15.4M
                return (c1 < c2) ? -1 : 1; \
11176
15.4M
        } \
11177
15.4M
    } \
11178
15.4M
    while (0)
11179
11180
16.2M
    int kind1, kind2;
11181
16.2M
    const void *data1, *data2;
11182
16.2M
    Py_ssize_t len1, len2, len;
11183
11184
16.2M
    kind1 = PyUnicode_KIND(str1);
11185
16.2M
    kind2 = PyUnicode_KIND(str2);
11186
16.2M
    data1 = PyUnicode_DATA(str1);
11187
16.2M
    data2 = PyUnicode_DATA(str2);
11188
16.2M
    len1 = PyUnicode_GET_LENGTH(str1);
11189
16.2M
    len2 = PyUnicode_GET_LENGTH(str2);
11190
16.2M
    len = Py_MIN(len1, len2);
11191
11192
16.2M
    switch(kind1) {
11193
1.17M
    case PyUnicode_1BYTE_KIND:
11194
1.17M
    {
11195
1.17M
        switch(kind2) {
11196
58.9k
        case PyUnicode_1BYTE_KIND:
11197
58.9k
        {
11198
58.9k
            int cmp = memcmp(data1, data2, len);
11199
            /* normalize result of memcmp() into the range [-1; 1] */
11200
58.9k
            if (cmp < 0)
11201
36.8k
                return -1;
11202
22.1k
            if (cmp > 0)
11203
21.6k
                return 1;
11204
506
            break;
11205
22.1k
        }
11206
887k
        case PyUnicode_2BYTE_KIND:
11207
887k
            COMPARE(Py_UCS1, Py_UCS2);
11208
0
            break;
11209
227k
        case PyUnicode_4BYTE_KIND:
11210
227k
            COMPARE(Py_UCS1, Py_UCS4);
11211
0
            break;
11212
0
        default:
11213
0
            Py_UNREACHABLE();
11214
1.17M
        }
11215
506
        break;
11216
1.17M
    }
11217
14.0M
    case PyUnicode_2BYTE_KIND:
11218
14.0M
    {
11219
14.0M
        switch(kind2) {
11220
2.51k
        case PyUnicode_1BYTE_KIND:
11221
2.51k
            COMPARE(Py_UCS2, Py_UCS1);
11222
0
            break;
11223
12.6M
        case PyUnicode_2BYTE_KIND:
11224
12.6M
        {
11225
12.6M
            COMPARE(Py_UCS2, Py_UCS2);
11226
0
            break;
11227
12.6M
        }
11228
1.40M
        case PyUnicode_4BYTE_KIND:
11229
1.40M
            COMPARE(Py_UCS2, Py_UCS4);
11230
0
            break;
11231
0
        default:
11232
0
            Py_UNREACHABLE();
11233
14.0M
        }
11234
0
        break;
11235
14.0M
    }
11236
1.07M
    case PyUnicode_4BYTE_KIND:
11237
1.07M
    {
11238
1.07M
        switch(kind2) {
11239
897
        case PyUnicode_1BYTE_KIND:
11240
897
            COMPARE(Py_UCS4, Py_UCS1);
11241
0
            break;
11242
341k
        case PyUnicode_2BYTE_KIND:
11243
341k
            COMPARE(Py_UCS4, Py_UCS2);
11244
0
            break;
11245
733k
        case PyUnicode_4BYTE_KIND:
11246
733k
        {
11247
733k
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11248
733k
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11249
            /* normalize result of wmemcmp() into the range [-1; 1] */
11250
733k
            if (cmp < 0)
11251
354k
                return -1;
11252
378k
            if (cmp > 0)
11253
378k
                return 1;
11254
#else
11255
            COMPARE(Py_UCS4, Py_UCS4);
11256
#endif
11257
0
            break;
11258
378k
        }
11259
0
        default:
11260
0
            Py_UNREACHABLE();
11261
1.07M
        }
11262
0
        break;
11263
1.07M
    }
11264
0
    default:
11265
0
        Py_UNREACHABLE();
11266
16.2M
    }
11267
11268
506
    if (len1 == len2)
11269
504
        return 0;
11270
2
    if (len1 < len2)
11271
2
        return -1;
11272
0
    else
11273
0
        return 1;
11274
11275
2
#undef COMPARE
11276
2
}
11277
11278
11279
int
11280
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11281
255M
{
11282
255M
    assert(PyUnicode_Check(str1));
11283
255M
    assert(PyUnicode_Check(str2));
11284
255M
    if (str1 == str2) {
11285
68.8M
        return 1;
11286
68.8M
    }
11287
187M
    return unicode_eq(str1, str2);
11288
255M
}
11289
11290
11291
int
11292
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11293
0
{
11294
0
    if (!PyUnicode_Check(str1)) {
11295
0
        PyErr_Format(PyExc_TypeError,
11296
0
                     "first argument must be str, not %T", str1);
11297
0
        return -1;
11298
0
    }
11299
0
    if (!PyUnicode_Check(str2)) {
11300
0
        PyErr_Format(PyExc_TypeError,
11301
0
                     "second argument must be str, not %T", str2);
11302
0
        return -1;
11303
0
    }
11304
11305
0
    return _PyUnicode_Equal(str1, str2);
11306
0
}
11307
11308
11309
int
11310
PyUnicode_Compare(PyObject *left, PyObject *right)
11311
6.08k
{
11312
6.08k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11313
        /* a string is equal to itself */
11314
6.08k
        if (left == right)
11315
0
            return 0;
11316
11317
6.08k
        return unicode_compare(left, right);
11318
6.08k
    }
11319
0
    PyErr_Format(PyExc_TypeError,
11320
0
                 "Can't compare %.100s and %.100s",
11321
0
                 Py_TYPE(left)->tp_name,
11322
0
                 Py_TYPE(right)->tp_name);
11323
0
    return -1;
11324
6.08k
}
11325
11326
int
11327
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11328
1.69M
{
11329
1.69M
    Py_ssize_t i;
11330
1.69M
    int kind;
11331
1.69M
    Py_UCS4 chr;
11332
11333
1.69M
    assert(_PyUnicode_CHECK(uni));
11334
1.69M
    kind = PyUnicode_KIND(uni);
11335
1.69M
    if (kind == PyUnicode_1BYTE_KIND) {
11336
1.69M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11337
1.69M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11338
1.69M
        size_t len, len2 = strlen(str);
11339
1.69M
        int cmp;
11340
11341
1.69M
        len = Py_MIN(len1, len2);
11342
1.69M
        cmp = memcmp(data, str, len);
11343
1.69M
        if (cmp != 0) {
11344
1.23M
            if (cmp < 0)
11345
8.13k
                return -1;
11346
1.22M
            else
11347
1.22M
                return 1;
11348
1.23M
        }
11349
460k
        if (len1 > len2)
11350
198
            return 1; /* uni is longer */
11351
460k
        if (len1 < len2)
11352
600
            return -1; /* str is longer */
11353
460k
        return 0;
11354
460k
    }
11355
1.21k
    else {
11356
1.21k
        const void *data = PyUnicode_DATA(uni);
11357
        /* Compare Unicode string and source character set string */
11358
2.52k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11359
2.28k
            if (chr != (unsigned char)str[i])
11360
981
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11361
        /* This check keeps Python strings that end in '\0' from comparing equal
11362
         to C strings identical up to that point. */
11363
235
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11364
235
            return 1; /* uni is longer */
11365
0
        if (str[i])
11366
0
            return -1; /* str is longer */
11367
0
        return 0;
11368
0
    }
11369
1.69M
}
11370
11371
int
11372
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11373
0
{
11374
0
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11375
0
}
11376
11377
int
11378
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11379
0
{
11380
0
    assert(_PyUnicode_CHECK(unicode));
11381
0
    assert(str);
11382
11383
0
    if (PyUnicode_IS_ASCII(unicode)) {
11384
0
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11385
0
        return size == len &&
11386
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11387
0
    }
11388
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11389
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11390
0
        return size == len &&
11391
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11392
0
    }
11393
11394
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11395
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11396
0
        return 0;
11397
0
    }
11398
0
    const unsigned char *s = (const unsigned char *)str;
11399
0
    const unsigned char *ends = s + (size_t)size;
11400
0
    int kind = PyUnicode_KIND(unicode);
11401
0
    const void *data = PyUnicode_DATA(unicode);
11402
    /* Compare Unicode string and UTF-8 string */
11403
0
    for (Py_ssize_t i = 0; i < len; i++) {
11404
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11405
0
        if (ch < 0x80) {
11406
0
            if (ends == s || s[0] != ch) {
11407
0
                return 0;
11408
0
            }
11409
0
            s += 1;
11410
0
        }
11411
0
        else if (ch < 0x800) {
11412
0
            if ((ends - s) < 2 ||
11413
0
                s[0] != (0xc0 | (ch >> 6)) ||
11414
0
                s[1] != (0x80 | (ch & 0x3f)))
11415
0
            {
11416
0
                return 0;
11417
0
            }
11418
0
            s += 2;
11419
0
        }
11420
0
        else if (ch < 0x10000) {
11421
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11422
0
                (ends - s) < 3 ||
11423
0
                s[0] != (0xe0 | (ch >> 12)) ||
11424
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11425
0
                s[2] != (0x80 | (ch & 0x3f)))
11426
0
            {
11427
0
                return 0;
11428
0
            }
11429
0
            s += 3;
11430
0
        }
11431
0
        else {
11432
0
            assert(ch <= MAX_UNICODE);
11433
0
            if ((ends - s) < 4 ||
11434
0
                s[0] != (0xf0 | (ch >> 18)) ||
11435
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11436
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11437
0
                s[3] != (0x80 | (ch & 0x3f)))
11438
0
            {
11439
0
                return 0;
11440
0
            }
11441
0
            s += 4;
11442
0
        }
11443
0
    }
11444
0
    return s == ends;
11445
0
}
11446
11447
int
11448
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11449
7.52M
{
11450
7.52M
    size_t len;
11451
7.52M
    assert(_PyUnicode_CHECK(unicode));
11452
7.52M
    assert(str);
11453
#ifndef NDEBUG
11454
    for (const char *p = str; *p; p++) {
11455
        assert((unsigned char)*p < 128);
11456
    }
11457
#endif
11458
7.52M
    if (!PyUnicode_IS_ASCII(unicode))
11459
153k
        return 0;
11460
7.37M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11461
7.37M
    return strlen(str) == len &&
11462
7.37M
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11463
7.52M
}
11464
11465
int
11466
_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11467
0
{
11468
0
    PyObject *right_uni;
11469
11470
0
    assert(_PyUnicode_CHECK(left));
11471
0
    assert(right->string);
11472
#ifndef NDEBUG
11473
    for (const char *p = right->string; *p; p++) {
11474
        assert((unsigned char)*p < 128);
11475
    }
11476
#endif
11477
11478
0
    if (!PyUnicode_IS_ASCII(left))
11479
0
        return 0;
11480
11481
0
    right_uni = _PyUnicode_FromId(right);       /* borrowed */
11482
0
    if (right_uni == NULL) {
11483
        /* memory error or bad data */
11484
0
        PyErr_Clear();
11485
0
        return _PyUnicode_EqualToASCIIString(left, right->string);
11486
0
    }
11487
11488
0
    if (left == right_uni)
11489
0
        return 1;
11490
11491
0
    assert(PyUnicode_CHECK_INTERNED(right_uni));
11492
0
    if (PyUnicode_CHECK_INTERNED(left)) {
11493
0
        return 0;
11494
0
    }
11495
11496
0
    Py_hash_t right_hash = PyUnicode_HASH(right_uni);
11497
0
    assert(right_hash != -1);
11498
0
    Py_hash_t hash = PyUnicode_HASH(left);
11499
0
    if (hash != -1 && hash != right_hash) {
11500
0
        return 0;
11501
0
    }
11502
11503
0
    return unicode_eq(left, right_uni);
11504
0
}
11505
11506
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    /* Anything that is not str vs. str is left to the other operand. */
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        switch (op) {
        case Py_EQ:
        case Py_LE:
        case Py_GE:
            /* a string is equal to itself */
            Py_RETURN_TRUE;
        case Py_NE:
        case Py_LT:
        case Py_GT:
            Py_RETURN_FALSE;
        default:
            PyErr_BadArgument();
            return NULL;
        }
    }

    if (op == Py_EQ || op == Py_NE) {
        /* Equality only: flip the unicode_eq() result for Py_NE. */
        const int negate = (op == Py_NE);
        const int outcome = unicode_eq(left, right) ^ negate;
        return PyBool_FromLong(outcome);
    }

    /* Ordering comparisons need the three-way result. */
    const int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11540
11541
int
11542
PyUnicode_Contains(PyObject *str, PyObject *substr)
11543
93.0M
{
11544
93.0M
    int kind1, kind2;
11545
93.0M
    const void *buf1, *buf2;
11546
93.0M
    Py_ssize_t len1, len2;
11547
93.0M
    int result;
11548
11549
93.0M
    if (!PyUnicode_Check(substr)) {
11550
0
        PyErr_Format(PyExc_TypeError,
11551
0
                     "'in <string>' requires string as left operand, not %.100s",
11552
0
                     Py_TYPE(substr)->tp_name);
11553
0
        return -1;
11554
0
    }
11555
93.0M
    if (ensure_unicode(str) < 0)
11556
0
        return -1;
11557
11558
93.0M
    kind1 = PyUnicode_KIND(str);
11559
93.0M
    kind2 = PyUnicode_KIND(substr);
11560
93.0M
    if (kind1 < kind2)
11561
4.05M
        return 0;
11562
88.9M
    len1 = PyUnicode_GET_LENGTH(str);
11563
88.9M
    len2 = PyUnicode_GET_LENGTH(substr);
11564
88.9M
    if (len1 < len2)
11565
4.93M
        return 0;
11566
84.0M
    buf1 = PyUnicode_DATA(str);
11567
84.0M
    buf2 = PyUnicode_DATA(substr);
11568
84.0M
    if (len2 == 1) {
11569
83.9M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11570
83.9M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11571
83.9M
        return result;
11572
83.9M
    }
11573
36.5k
    if (kind2 != kind1) {
11574
18.0k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11575
18.0k
        if (!buf2)
11576
0
            return -1;
11577
18.0k
    }
11578
11579
36.5k
    switch (kind1) {
11580
18.4k
    case PyUnicode_1BYTE_KIND:
11581
18.4k
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11582
18.4k
        break;
11583
14.1k
    case PyUnicode_2BYTE_KIND:
11584
14.1k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11585
14.1k
        break;
11586
3.94k
    case PyUnicode_4BYTE_KIND:
11587
3.94k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11588
3.94k
        break;
11589
0
    default:
11590
0
        Py_UNREACHABLE();
11591
36.5k
    }
11592
11593
36.5k
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11594
36.5k
    if (kind2 != kind1)
11595
18.0k
        PyMem_Free((void *)buf2);
11596
11597
36.5k
    return result;
11598
36.5k
}
11599
11600
/* Concat to string or Unicode object giving a new Unicode object. */
11601
11602
PyObject *
PyUnicode_Concat(PyObject *left, PyObject *right)
{
    if (ensure_unicode(left) < 0) {
        return NULL;
    }

    if (!PyUnicode_Check(right)) {
        if (!_PyTemplate_CheckExact(right)) {
            PyErr_Format(PyExc_TypeError,
                "can only concatenate str (not \"%.200s\") to str",
                Py_TYPE(right)->tp_name);
            return NULL;
        }
        // str + tstring is implemented in the tstring type
        return _PyTemplate_Concat(left, right);
    }

    /* Shortcuts: concatenating with the empty-string singleton just
       converts the other operand via PyUnicode_FromObject(). */
    PyObject *empty = unicode_get_empty();  // Borrowed reference
    if (left == empty) {
        return PyUnicode_FromObject(right);
    }
    if (right == empty) {
        return PyUnicode_FromObject(left);
    }

    const Py_ssize_t left_len = PyUnicode_GET_LENGTH(left);
    const Py_ssize_t right_len = PyUnicode_GET_LENGTH(right);
    /* Check before adding so the sum cannot overflow Py_ssize_t. */
    if (left_len > PY_SSIZE_T_MAX - right_len) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        return NULL;
    }
    const Py_ssize_t total_len = left_len + right_len;

    /* The result must be able to hold the widest character of either side. */
    const Py_UCS4 left_max = PyUnicode_MAX_CHAR_VALUE(left);
    const Py_UCS4 right_max = PyUnicode_MAX_CHAR_VALUE(right);
    const Py_UCS4 widest = Py_MAX(left_max, right_max);

    /* Concat the two Unicode strings */
    PyObject *joined = PyUnicode_New(total_len, widest);
    if (joined == NULL) {
        return NULL;
    }
    _PyUnicode_FastCopyCharacters(joined, 0, left, 0, left_len);
    _PyUnicode_FastCopyCharacters(joined, left_len, right, 0, right_len);
    assert(_PyUnicode_CheckConsistency(joined, 1));
    return joined;
}
11656
11657
void
11658
PyUnicode_Append(PyObject **p_left, PyObject *right)
11659
1.06M
{
11660
1.06M
    PyObject *left, *res;
11661
1.06M
    Py_UCS4 maxchar, maxchar2;
11662
1.06M
    Py_ssize_t left_len, right_len, new_len;
11663
11664
1.06M
    if (p_left == NULL) {
11665
0
        if (!PyErr_Occurred())
11666
0
            PyErr_BadInternalCall();
11667
0
        return;
11668
0
    }
11669
1.06M
    left = *p_left;
11670
1.06M
    if (right == NULL || left == NULL
11671
1.06M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11672
0
        if (!PyErr_Occurred())
11673
0
            PyErr_BadInternalCall();
11674
0
        goto error;
11675
0
    }
11676
11677
    /* Shortcuts */
11678
1.06M
    PyObject *empty = unicode_get_empty();  // Borrowed reference
11679
1.06M
    if (left == empty) {
11680
360k
        Py_DECREF(left);
11681
360k
        *p_left = Py_NewRef(right);
11682
360k
        return;
11683
360k
    }
11684
701k
    if (right == empty) {
11685
0
        return;
11686
0
    }
11687
11688
701k
    left_len = PyUnicode_GET_LENGTH(left);
11689
701k
    right_len = PyUnicode_GET_LENGTH(right);
11690
701k
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11691
0
        PyErr_SetString(PyExc_OverflowError,
11692
0
                        "strings are too large to concat");
11693
0
        goto error;
11694
0
    }
11695
701k
    new_len = left_len + right_len;
11696
11697
701k
    if (unicode_modifiable(left)
11698
701k
        && PyUnicode_CheckExact(right)
11699
701k
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11700
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11701
           to change the structure size, but characters are stored just after
11702
           the structure, and so it requires to move all characters which is
11703
           not so different than duplicating the string. */
11704
701k
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11705
652k
    {
11706
        /* append inplace */
11707
652k
        if (unicode_resize(p_left, new_len) != 0)
11708
0
            goto error;
11709
11710
        /* copy 'right' into the newly allocated area of 'left' */
11711
652k
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11712
652k
    }
11713
49.0k
    else {
11714
49.0k
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11715
49.0k
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11716
49.0k
        maxchar = Py_MAX(maxchar, maxchar2);
11717
11718
        /* Concat the two Unicode strings */
11719
49.0k
        res = PyUnicode_New(new_len, maxchar);
11720
49.0k
        if (res == NULL)
11721
0
            goto error;
11722
49.0k
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11723
49.0k
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11724
49.0k
        Py_DECREF(left);
11725
49.0k
        *p_left = res;
11726
49.0k
    }
11727
701k
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11728
701k
    return;
11729
11730
0
error:
11731
0
    Py_CLEAR(*p_left);
11732
0
}
11733
11734
void
11735
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11736
0
{
11737
0
    PyUnicode_Append(pleft, right);
11738
0
    Py_XDECREF(right);
11739
0
}
11740
11741
/*[clinic input]
11742
@text_signature "($self, sub[, start[, end]], /)"
11743
str.count as unicode_count -> Py_ssize_t
11744
11745
    self as str: self
11746
    sub as substr: unicode
11747
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11748
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11749
    /
11750
11751
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11752
11753
Optional arguments start and end are interpreted as in slice notation.
11754
[clinic start generated code]*/
11755
11756
static Py_ssize_t
11757
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11758
                   Py_ssize_t end)
11759
/*[clinic end generated code: output=8fcc3aef0b18edbf input=6f168ffd94be8785]*/
11760
15.9M
{
11761
15.9M
    assert(PyUnicode_Check(str));
11762
15.9M
    assert(PyUnicode_Check(substr));
11763
11764
15.9M
    Py_ssize_t result;
11765
15.9M
    int kind1, kind2;
11766
15.9M
    const void *buf1 = NULL, *buf2 = NULL;
11767
15.9M
    Py_ssize_t len1, len2;
11768
11769
15.9M
    kind1 = PyUnicode_KIND(str);
11770
15.9M
    kind2 = PyUnicode_KIND(substr);
11771
15.9M
    if (kind1 < kind2)
11772
0
        return 0;
11773
11774
15.9M
    len1 = PyUnicode_GET_LENGTH(str);
11775
15.9M
    len2 = PyUnicode_GET_LENGTH(substr);
11776
15.9M
    ADJUST_INDICES(start, end, len1);
11777
15.9M
    if (end - start < len2)
11778
54.4k
        return 0;
11779
11780
15.8M
    buf1 = PyUnicode_DATA(str);
11781
15.8M
    buf2 = PyUnicode_DATA(substr);
11782
15.8M
    if (kind2 != kind1) {
11783
3.36M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11784
3.36M
        if (!buf2)
11785
0
            goto onError;
11786
3.36M
    }
11787
11788
    // We don't reuse `anylib_count` here because of the explicit casts.
11789
15.8M
    switch (kind1) {
11790
12.4M
    case PyUnicode_1BYTE_KIND:
11791
12.4M
        result = ucs1lib_count(
11792
12.4M
            ((const Py_UCS1*)buf1) + start, end - start,
11793
12.4M
            buf2, len2, PY_SSIZE_T_MAX
11794
12.4M
            );
11795
12.4M
        break;
11796
2.63M
    case PyUnicode_2BYTE_KIND:
11797
2.63M
        result = ucs2lib_count(
11798
2.63M
            ((const Py_UCS2*)buf1) + start, end - start,
11799
2.63M
            buf2, len2, PY_SSIZE_T_MAX
11800
2.63M
            );
11801
2.63M
        break;
11802
728k
    case PyUnicode_4BYTE_KIND:
11803
728k
        result = ucs4lib_count(
11804
728k
            ((const Py_UCS4*)buf1) + start, end - start,
11805
728k
            buf2, len2, PY_SSIZE_T_MAX
11806
728k
            );
11807
728k
        break;
11808
0
    default:
11809
0
        Py_UNREACHABLE();
11810
15.8M
    }
11811
11812
15.8M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11813
15.8M
    if (kind2 != kind1)
11814
3.36M
        PyMem_Free((void *)buf2);
11815
11816
15.8M
    return result;
11817
0
  onError:
11818
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11819
0
    if (kind2 != kind1)
11820
0
        PyMem_Free((void *)buf2);
11821
0
    return -1;
11822
15.8M
}
11823
11824
/*[clinic input]
11825
str.encode as unicode_encode
11826
11827
    encoding: str(c_default="NULL") = 'utf-8'
11828
        The encoding in which to encode the string.
11829
    errors: str(c_default="NULL") = 'strict'
11830
        The error handling scheme to use for encoding errors.
11831
        The default is 'strict' meaning that encoding errors raise a
11832
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11833
        'xmlcharrefreplace' as well as any other name registered with
11834
        codecs.register_error that can handle UnicodeEncodeErrors.
11835
11836
Encode the string using the codec registered for encoding.
11837
[clinic start generated code]*/
11838
11839
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* encoding and errors are forwarded unchanged; per the clinic
       declaration their C default is NULL. */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11845
11846
/*[clinic input]
11847
str.expandtabs as unicode_expandtabs
11848
11849
    tabsize: int = 8
11850
11851
Return a copy where all tab characters are expanded using spaces.
11852
11853
If tabsize is not given, a tab size of 8 characters is assumed.
11854
[clinic start generated code]*/
11855
11856
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    Py_ssize_t idx, out_len, out_pos, column, step;
    Py_UCS4 ch;
    PyObject *expanded;
    void *dest_data;
    int saw_tab = 0;

    /* First pass: determine size of output string */
    out_len = 0;
    column = 0;
    for (idx = 0; idx < src_len; idx++) {
        ch = PyUnicode_READ(kind, src_data, idx);
        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            column++;
            out_len++;
            /* a line break restarts the column count */
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        saw_tab = 1;
        /* with tabsize <= 0 a tab contributes nothing to the output */
        if (tabsize <= 0) {
            continue;
        }
        step = tabsize - (column % tabsize); /* cannot overflow */
        if (out_len > PY_SSIZE_T_MAX - step) {
            goto overflow;
        }
        column += step;
        out_len += step;
    }
    if (!saw_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (!expanded) {
        return NULL;
    }
    dest_data = PyUnicode_DATA(expanded);

    out_pos = 0;
    column = 0;
    for (idx = 0; idx < src_len; idx++) {
        ch = PyUnicode_READ(kind, src_data, idx);
        if (ch != '\t') {
            column++;
            PyUnicode_WRITE(kind, dest_data, out_pos, ch);
            out_pos++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        if (tabsize <= 0) {
            continue;
        }
        /* pad with spaces up to the next multiple of tabsize */
        step = tabsize - (column % tabsize);
        column += step;
        unicode_fill(kind, dest_data, ' ', out_pos, step);
        out_pos += step;
    }
    assert (out_pos == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11931
11932
/*[clinic input]
11933
str.find as unicode_find = str.count
11934
11935
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11936
11937
Optional arguments start and end are interpreted as in slice notation.
11938
Return -1 on failure.
11939
[clinic start generated code]*/
11940
11941
static Py_ssize_t
11942
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11943
                  Py_ssize_t end)
11944
/*[clinic end generated code: output=51dbe6255712e278 input=4a89d2d68ef57256]*/
11945
12.6M
{
11946
12.6M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11947
12.6M
    if (result < 0) {
11948
185k
        return -1;
11949
185k
    }
11950
12.4M
    return result;
11951
12.6M
}
11952
11953
static PyObject *
11954
unicode_getitem(PyObject *self, Py_ssize_t index)
11955
46.3M
{
11956
46.3M
    const void *data;
11957
46.3M
    int kind;
11958
46.3M
    Py_UCS4 ch;
11959
11960
46.3M
    if (!PyUnicode_Check(self)) {
11961
0
        PyErr_BadArgument();
11962
0
        return NULL;
11963
0
    }
11964
46.3M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11965
467
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11966
467
        return NULL;
11967
467
    }
11968
46.3M
    kind = PyUnicode_KIND(self);
11969
46.3M
    data = PyUnicode_DATA(self);
11970
46.3M
    ch = PyUnicode_READ(kind, data, index);
11971
46.3M
    return unicode_char(ch);
11972
46.3M
}
11973
11974
/* Believe it or not, this produces the same value for ASCII strings
11975
   as bytes_hash(). */
11976
static Py_hash_t
11977
unicode_hash(PyObject *self)
11978
37.5M
{
11979
37.5M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11980
11981
#ifdef Py_DEBUG
11982
    assert(_Py_HashSecret_Initialized);
11983
#endif
11984
37.5M
    Py_hash_t hash = PyUnicode_HASH(self);
11985
37.5M
    if (hash != -1) {
11986
209k
        return hash;
11987
209k
    }
11988
37.3M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11989
37.3M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11990
11991
37.3M
    PyUnicode_SET_HASH(self, x);
11992
37.3M
    return x;
11993
37.5M
}
11994
11995
/*[clinic input]
11996
str.index as unicode_index = str.count
11997
11998
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11999
12000
Optional arguments start and end are interpreted as in slice notation.
12001
Raises ValueError when the substring is not found.
12002
[clinic start generated code]*/
12003
12004
static Py_ssize_t
12005
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12006
                   Py_ssize_t end)
12007
/*[clinic end generated code: output=77558288837cdf40 input=d986aeac0be14a1c]*/
12008
392k
{
12009
392k
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
12010
392k
    if (result == -1) {
12011
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12012
0
    }
12013
392k
    else if (result < 0) {
12014
0
        return -1;
12015
0
    }
12016
392k
    return result;
12017
392k
}
12018
12019
/*[clinic input]
12020
str.isascii as unicode_isascii
12021
12022
Return True if all characters in the string are ASCII, False otherwise.
12023
12024
ASCII characters have code points in the range U+0000-U+007F.
12025
Empty string is ASCII too.
12026
[clinic start generated code]*/
12027
12028
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* No scan of the characters is done here: the answer comes straight
       from PyUnicode_IS_ASCII() and is converted to a bool object. */
    const long ascii_only = PyUnicode_IS_ASCII(self);

    return PyBool_FromLong(ascii_only);
}
12034
12035
/*[clinic input]
12036
str.islower as unicode_islower
12037
12038
Return True if the string is a lowercase string, False otherwise.
12039
12040
A string is lowercase if all cased characters in the string are lowercase and
12041
there is at least one cased character in the string.
12042
[clinic start generated code]*/
12043
12044
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISLOWER(only));
    }

    /* Any uppercase or titlecase character disqualifies the string; at
       least one lowercase character is needed for a True result. */
    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
12077
12078
/*[clinic input]
12079
str.isupper as unicode_isupper
12080
12081
Return True if the string is an uppercase string, False otherwise.
12082
12083
A string is uppercase if all cased characters in the string are uppercase and
12084
there is at least one cased character in the string.
12085
[clinic start generated code]*/
12086
12087
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISUPPER(only) != 0);
    }

    /* Any lowercase or titlecase character disqualifies the string; at
       least one uppercase character is needed for a True result. */
    int seen_upper = 0;
    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
        pos++;
    }
    return PyBool_FromLong(seen_upper);
}
12120
12121
/*[clinic input]
12122
str.istitle as unicode_istitle
12123
12124
Return True if the string is a title-cased string, False otherwise.
12125
12126
In a title-cased string, upper- and title-case characters may only
12127
follow uncased characters and lowercase characters only cased ones.
12128
[clinic start generated code]*/
12129
12130
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        const int title_or_upper = (Py_UNICODE_ISTITLE(only) != 0) ||
                                   (Py_UNICODE_ISUPPER(only) != 0);
        return PyBool_FromLong(title_or_upper);
    }

    int found_cased = 0;   /* at least one cased character was seen */
    int after_cased = 0;   /* the previous character was cased */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        const int word_start = Py_UNICODE_ISUPPER(ch) ||
                               Py_UNICODE_ISTITLE(ch);

        if (!word_start && !Py_UNICODE_ISLOWER(ch)) {
            /* Uncased character: the next cased one starts a new word. */
            after_cased = 0;
            continue;
        }

        /* Upper/title case may only follow an uncased character;
           lowercase may only follow a cased one. */
        if (word_start ? after_cased : !after_cased) {
            Py_RETURN_FALSE;
        }
        after_cased = 1;
        found_cased = 1;
    }
    return PyBool_FromLong(found_cased);
}
12176
12177
/*[clinic input]
12178
str.isspace as unicode_isspace
12179
12180
Return True if the string is a whitespace string, False otherwise.
12181
12182
A string is whitespace if all characters in the string are whitespace and there
12183
is at least one character in the string.
12184
[clinic start generated code]*/
12185
12186
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISSPACE(only));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* The first non-whitespace character settles the answer. */
    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12214
12215
/*[clinic input]
12216
str.isalpha as unicode_isalpha
12217
12218
Return True if the string is an alphabetic string, False otherwise.
12219
12220
A string is alphabetic if all characters in the string are alphabetic and there
12221
is at least one character in the string.
12222
[clinic start generated code]*/
12223
12224
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALPHA(only));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* The first non-alphabetic character settles the answer. */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALPHA(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12251
12252
/*[clinic input]
12253
str.isalnum as unicode_isalnum
12254
12255
Return True if the string is an alpha-numeric string, False otherwise.
12256
12257
A string is alpha-numeric if all characters in the string are alpha-numeric and
12258
there is at least one character in the string.
12259
[clinic start generated code]*/
12260
12261
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
{
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);

    /* Special case for empty strings */
    if (len == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (len == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    /* The first character that is not alpha-numeric settles the answer.
       Each character is read into a local before being tested. */
    Py_ssize_t pos = 0;
    while (pos < len) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12290
12291
/*[clinic input]
12292
str.isdecimal as unicode_isdecimal
12293
12294
Return True if the string is a decimal string, False otherwise.
12295
12296
A string is a decimal string if all characters in the string are decimal and
12297
there is at least one character in the string.
12298
[clinic start generated code]*/
12299
12300
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDECIMAL(only));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* The first non-decimal character settles the answer. */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDECIMAL(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12327
12328
/*[clinic input]
12329
str.isdigit as unicode_isdigit
12330
12331
Return True if the string is a digit string, False otherwise.
12332
12333
A string is a digit string if all characters in the string are digits and there
12334
is at least one character in the string.
12335
[clinic start generated code]*/
12336
12337
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    /* The first non-digit character settles the answer. */
    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDIGIT(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12365
12366
/*[clinic input]
12367
str.isnumeric as unicode_isnumeric
12368
12369
Return True if the string is a numeric string, False otherwise.
12370
12371
A string is numeric if all characters in the string are numeric and there is at
12372
least one character in the string.
12373
[clinic start generated code]*/
12374
12375
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISNUMERIC(only));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* The first non-numeric character settles the answer. */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12402
12403
Py_ssize_t
12404
_PyUnicode_ScanIdentifier(PyObject *self)
12405
12.8k
{
12406
12.8k
    Py_ssize_t i;
12407
12.8k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12408
12.8k
    if (len == 0) {
12409
        /* an empty string is not a valid identifier */
12410
0
        return 0;
12411
0
    }
12412
12413
12.8k
    int kind = PyUnicode_KIND(self);
12414
12.8k
    const void *data = PyUnicode_DATA(self);
12415
12.8k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12416
    /* PEP 3131 says that the first character must be in
12417
       XID_Start and subsequent characters in XID_Continue,
12418
       and for the ASCII range, the 2.x rules apply (i.e
12419
       start with letters and underscore, continue with
12420
       letters, digits, underscore). However, given the current
12421
       definition of XID_Start and XID_Continue, it is sufficient
12422
       to check just for these, except that _ must be allowed
12423
       as starting an identifier.  */
12424
12.8k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12425
425
        return 0;
12426
425
    }
12427
12428
60.9k
    for (i = 1; i < len; i++) {
12429
48.6k
        ch = PyUnicode_READ(kind, data, i);
12430
48.6k
        if (!_PyUnicode_IsXidContinue(ch)) {
12431
181
            return i;
12432
181
        }
12433
48.6k
    }
12434
12.2k
    return i;
12435
12.4k
}
12436
12437
int
12438
PyUnicode_IsIdentifier(PyObject *self)
12439
590
{
12440
590
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12441
590
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12442
    /* an empty string is not a valid identifier */
12443
590
    return len && i == len;
12444
590
}
12445
12446
/*[clinic input]
12447
str.isidentifier as unicode_isidentifier
12448
12449
Return True if the string is a valid Python identifier, False otherwise.
12450
12451
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12452
such as "def" or "class".
12453
[clinic start generated code]*/
12454
12455
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
{
    /* Wrap the C-level int answer in a Python bool. */
    const int is_identifier = PyUnicode_IsIdentifier(self);

    return PyBool_FromLong(is_identifier);
}
12461
12462
/*[clinic input]
12463
str.isprintable as unicode_isprintable
12464
12465
Return True if all characters in the string are printable, False otherwise.
12466
12467
A character is printable if repr() may use it in its output.
12468
[clinic start generated code]*/
12469
12470
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=4e56bcc6b06ca18c]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISPRINTABLE(only));
    }

    /* There is no empty-string special case here: with nothing to
       inspect the loop body never runs and the result is True. */
    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISPRINTABLE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12494
12495
/*[clinic input]
12496
str.join as unicode_join
12497
12498
    iterable: object
12499
    /
12500
12501
Concatenate any number of strings.
12502
12503
The string whose method is called is inserted in between each given string.
12504
The result is returned as a new string.
12505
12506
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12507
[clinic start generated code]*/
12508
12509
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
{
    /* Method entry point for str.join(): all of the work is done by
       PyUnicode_Join(), with `self` passed as its first argument. */
    PyObject *joined = PyUnicode_Join(self, iterable);

    return joined;
}
12515
12516
static Py_ssize_t
12517
unicode_length(PyObject *self)
12518
33.1M
{
12519
33.1M
    return PyUnicode_GET_LENGTH(self);
12520
33.1M
}
12521
12522
/*[clinic input]
12523
str.ljust as unicode_ljust
12524
12525
    width: Py_ssize_t
12526
    fillchar: Py_UCS4 = ' '
12527
    /
12528
12529
Return a left-justified string of length width.
12530
12531
Padding is done using the specified fill character (default is a space).
12532
[clinic start generated code]*/
12533
12534
static PyObject *
12535
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12536
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12537
0
{
12538
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12539
0
        return unicode_result_unchanged(self);
12540
12541
0
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12542
0
}
12543
12544
/*[clinic input]
12545
str.lower as unicode_lower
12546
12547
Return a copy of the string converted to lowercase.
12548
[clinic start generated code]*/
12549
12550
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* Non-ASCII strings go through the general case-mapping path. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }

    /* ASCII strings use the dedicated helper; the flag value 1 is
       presumably its "lowercase" selector -- confirm against
       ascii_upper_or_lower(). */
    return ascii_upper_or_lower(self, 1);
}
12558
12559
52.7M
/* Strip modes passed as `striptype` to the strip helpers.  Each value
   is also the index of the matching method name in stripfuncnames. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above.  Both the strings and the pointer array
   itself are const: this is a read-only lookup table. */
static const char *const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name ("lstrip", "rstrip" or "strip") for strip mode `i`; used
   when formatting error messages.  `i` is not range-checked. */
#define STRIPNAME(i) (stripfuncnames[i])
12567
12568
/* externally visible for str.strip(unicode) */
12569
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    /* Strip from `self` the characters that occur in `sepobj`, on the
       side(s) selected by `striptype`, and return the remaining slice
       via PyUnicode_Substring(). */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);

    const Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);
    const BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                               PyUnicode_DATA(sepobj),
                                               seplen);

    /* [left, right) is the part of the string that is kept. */
    Py_ssize_t left = 0;
    Py_ssize_t right = len;

    if (striptype != RIGHTSTRIP) {
        for (; left < len; left++) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, left);
            /* The mask test runs first; the search through sepobj is
               only done for characters that pass it. */
            if (!BLOOM(sepmask, ch)
                || PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
            {
                break;
            }
        }
    }

    if (striptype != LEFTSTRIP) {
        /* Never move below `left`, so a fully stripped string yields an
           empty range rather than a negative one. */
        while (right > left) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, right - 1);
            if (!BLOOM(sepmask, ch)
                || PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
            {
                break;
            }
            right--;
        }
    }

    return PyUnicode_Substring(self, left, right);
}
12615
12616
PyObject*
12617
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12618
247M
{
12619
247M
    const unsigned char *data;
12620
247M
    int kind;
12621
247M
    Py_ssize_t length;
12622
12623
247M
    length = PyUnicode_GET_LENGTH(self);
12624
247M
    end = Py_MIN(end, length);
12625
12626
247M
    if (start == 0 && end == length)
12627
45.3M
        return unicode_result_unchanged(self);
12628
12629
202M
    if (start < 0 || end < 0) {
12630
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12631
0
        return NULL;
12632
0
    }
12633
202M
    if (start >= length || end < start)
12634
147k
        _Py_RETURN_UNICODE_EMPTY();
12635
12636
202M
    length = end - start;
12637
202M
    if (PyUnicode_IS_ASCII(self)) {
12638
43.0M
        data = PyUnicode_1BYTE_DATA(self);
12639
43.0M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12640
43.0M
    }
12641
159M
    else {
12642
159M
        kind = PyUnicode_KIND(self);
12643
159M
        data = PyUnicode_1BYTE_DATA(self);
12644
159M
        return PyUnicode_FromKindAndData(kind,
12645
159M
                                         data + kind * start,
12646
159M
                                         length);
12647
159M
    }
12648
202M
}
12649
12650
static PyObject *
do_strip(PyObject *self, int striptype)
{
    /* Strip whitespace from the side(s) of `self` selected by
       `striptype` and return the remaining slice. */
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int strip_left = (striptype != RIGHTSTRIP);
    const int strip_right = (striptype != LEFTSTRIP);

    /* [left, right) is the part of the string that is kept. */
    Py_ssize_t left = 0;
    Py_ssize_t right = len;

    if (PyUnicode_IS_ASCII(self)) {
        /* ASCII strings: classify each byte with the
           _Py_ascii_whitespace lookup table. */
        const Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(self);

        if (strip_left) {
            while (left < len && _Py_ascii_whitespace[bytes[left]]) {
                left++;
            }
        }
        if (strip_right) {
            while (right > left && _Py_ascii_whitespace[bytes[right - 1]]) {
                right--;
            }
        }
    }
    else {
        /* Other strings: read each character according to the string's
           kind and test it with Py_UNICODE_ISSPACE(). */
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (strip_left) {
            while (left < len) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, left);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
                left++;
            }
        }
        if (strip_right) {
            while (right > left) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, right - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
                right--;
            }
        }
    }

    return PyUnicode_Substring(self, left, right);
}
12711
12712
12713
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    /* Shared implementation of strip/lstrip/rstrip: `sep` is either
       None (strip whitespace) or a str holding the characters to
       remove.  Anything else raises TypeError and returns NULL. */
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12729
12730
12731
/*[clinic input]
12732
str.strip as unicode_strip
12733
12734
    chars: object = None
12735
    /
12736
12737
Return a copy of the string with leading and trailing whitespace removed.
12738
12739
If chars is given and not None, remove characters in chars instead.
12740
[clinic start generated code]*/
12741
12742
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
{
    /* Strip on both sides; `chars` is validated by do_argstrip(). */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);

    return stripped;
}
12748
12749
12750
/*[clinic input]
12751
str.lstrip as unicode_lstrip
12752
12753
    chars: object = None
12754
    /
12755
12756
Return a copy of the string with leading whitespace removed.
12757
12758
If chars is given and not None, remove characters in chars instead.
12759
[clinic start generated code]*/
12760
12761
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* Strip on the left side only; `chars` is validated by
       do_argstrip(). */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);

    return stripped;
}
12767
12768
12769
/*[clinic input]
12770
str.rstrip as unicode_rstrip
12771
12772
    chars: object = None
12773
    /
12774
12775
Return a copy of the string with trailing whitespace removed.
12776
12777
If chars is given and not None, remove characters in chars instead.
12778
[clinic start generated code]*/
12779
12780
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* Strip on the right side only; `chars` is validated by
       do_argstrip(). */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);

    return stripped;
}
12786
12787
12788
static PyObject*
12789
unicode_repeat(PyObject *str, Py_ssize_t len)
12790
337k
{
12791
337k
    PyObject *u;
12792
337k
    Py_ssize_t nchars, n;
12793
12794
337k
    if (len < 1)
12795
38.0k
        _Py_RETURN_UNICODE_EMPTY();
12796
12797
    /* no repeat, return original string */
12798
299k
    if (len == 1)
12799
99.6k
        return unicode_result_unchanged(str);
12800
12801
199k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12802
0
        PyErr_SetString(PyExc_OverflowError,
12803
0
                        "repeated string is too long");
12804
0
        return NULL;
12805
0
    }
12806
199k
    nchars = len * PyUnicode_GET_LENGTH(str);
12807
12808
199k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12809
199k
    if (!u)
12810
0
        return NULL;
12811
199k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12812
12813
199k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12814
196k
        int kind = PyUnicode_KIND(str);
12815
196k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12816
196k
        if (kind == PyUnicode_1BYTE_KIND) {
12817
196k
            void *to = PyUnicode_DATA(u);
12818
196k
            memset(to, (unsigned char)fill_char, len);
12819
196k
        }
12820
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12821
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12822
0
            for (n = 0; n < len; ++n)
12823
0
                ucs2[n] = fill_char;
12824
0
        } else {
12825
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12826
0
            assert(kind == PyUnicode_4BYTE_KIND);
12827
0
            for (n = 0; n < len; ++n)
12828
0
                ucs4[n] = fill_char;
12829
0
        }
12830
196k
    }
12831
3.18k
    else {
12832
3.18k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12833
3.18k
        char *to = (char *) PyUnicode_DATA(u);
12834
3.18k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12835
3.18k
            PyUnicode_GET_LENGTH(str) * char_size);
12836
3.18k
    }
12837
12838
199k
    assert(_PyUnicode_CheckConsistency(u, 1));
12839
199k
    return u;
12840
199k
}
12841
12842
PyObject *
12843
PyUnicode_Replace(PyObject *str,
12844
                  PyObject *substr,
12845
                  PyObject *replstr,
12846
                  Py_ssize_t maxcount)
12847
1
{
12848
1
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12849
1
            ensure_unicode(replstr) < 0)
12850
0
        return NULL;
12851
1
    return replace(str, substr, replstr, maxcount);
12852
1
}
12853
12854
/*[clinic input]
12855
str.replace as unicode_replace
12856
12857
    old: unicode
12858
    new: unicode
12859
    /
12860
    count: Py_ssize_t = -1
12861
        Maximum number of occurrences to replace.
12862
        -1 (the default value) means replace all occurrences.
12863
12864
Return a copy with all occurrences of substring old replaced by new.
12865
12866
If the optional argument count is given, only the first count occurrences are
12867
replaced.
12868
[clinic start generated code]*/
12869
12870
static PyObject *
12871
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12872
                     Py_ssize_t count)
12873
/*[clinic end generated code: output=b63f1a8b5eebf448 input=3345c455d60a5499]*/
12874
77.0M
{
12875
77.0M
    return replace(self, old, new, count);
12876
77.0M
}
12877
12878
/*[clinic input]
12879
str.removeprefix as unicode_removeprefix
12880
12881
    prefix: unicode
12882
    /
12883
12884
Return a str with the given prefix string removed if present.
12885
12886
If the string starts with the prefix string, return string[len(prefix):].
12887
Otherwise, return a copy of the original string.
12888
[clinic start generated code]*/
12889
12890
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
{
    /* Direction -1 is the same value str.startswith() hands to
       tailmatch(); the whole string (0 .. PY_SSIZE_T_MAX) is considered. */
    int matched = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
    if (matched == -1) {
        /* tailmatch() reported an error. */
        return NULL;
    }
    if (!matched) {
        /* No prefix to strip. */
        return unicode_result_unchanged(self);
    }

    /* Drop the first len(prefix) characters. */
    Py_ssize_t prefix_len = PyUnicode_GET_LENGTH(prefix);
    Py_ssize_t self_len = PyUnicode_GET_LENGTH(self);
    return PyUnicode_Substring(self, prefix_len, self_len);
}
12904
12905
/*[clinic input]
12906
str.removesuffix as unicode_removesuffix
12907
12908
    suffix: unicode
12909
    /
12910
12911
Return a str with the given suffix string removed if present.
12912
12913
If the string ends with the suffix string and that suffix is not empty,
12914
return string[:-len(suffix)]. Otherwise, return a copy of the original
12915
string.
12916
[clinic start generated code]*/
12917
12918
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* Direction +1 is the same value str.endswith() hands to
       tailmatch(); the whole string (0 .. PY_SSIZE_T_MAX) is considered. */
    int matched = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
    if (matched == -1) {
        /* tailmatch() reported an error. */
        return NULL;
    }
    if (!matched) {
        /* No suffix to strip. */
        return unicode_result_unchanged(self);
    }

    /* Keep everything except the last len(suffix) characters. */
    Py_ssize_t self_len = PyUnicode_GET_LENGTH(self);
    Py_ssize_t suffix_len = PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, self_len - suffix_len);
}
12932
12933
/* Build the repr() of a str object.

   The work is done in two passes:

   1. Walk the input once to compute the exact length of the output
      (osize), the largest character that is copied through unescaped
      (maxch), and how many single and double quotes the text contains.
   2. Allocate the result with PyUnicode_New(osize, maxch) and fill it,
      either by a plain copy (nothing needs escaping) or through the
      ucs1lib/ucs2lib/ucs4lib repr helper that matches the output kind.

   Returns the new string, or NULL with an exception set: OverflowError
   when the escaped length does not fit in Py_ssize_t, or whatever
   PyUnicode_New() raised. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;
    int ikind = PyUnicode_KIND(unicode);
    for (Py_ssize_t i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                maxch = (ch > maxch) ? ch : maxch;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \uHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    Py_UCS4 quote = '\'';
    int changed = (osize != isize);
    /* Extra characters needed on top of the per-character total: one
       backslash per single quote when both quote styles occur. */
    Py_ssize_t escaped_quotes = 0;
    if (squote) {
        changed = 1;
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            escaped_quotes = squote;
        else
            quote = '"';
    }

    /* The loop above only guards the per-character increments.  Guard the
       final "+ escaped_quotes + 2" as well so the signed additions can
       never overflow. */
    if (escaped_quotes > PY_SSIZE_T_MAX - 2
        || osize > PY_SSIZE_T_MAX - 2 - escaped_quotes)
    {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too long to generate repr");
        return NULL;
    }
    osize += escaped_quotes;
    osize += 2;   /* quotes */

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL)
        return NULL;
    int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (!changed) {
        /* Nothing to escape: quote + verbatim copy + quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }
    else {
        /* Dispatch on the kind of the output buffer. */
        switch (okind) {
        case PyUnicode_1BYTE_KIND:
            ucs1lib_repr(unicode, quote, odata);
            break;
        case PyUnicode_2BYTE_KIND:
            ucs2lib_repr(unicode, quote, odata);
            break;
        default:
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
13023
13024
/*[clinic input]
13025
str.rfind as unicode_rfind = str.count
13026
13027
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
13028
13029
Optional arguments start and end are interpreted as in slice notation.
13030
Return -1 on failure.
13031
[clinic start generated code]*/
13032
13033
static Py_ssize_t
13034
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
13035
                   Py_ssize_t end)
13036
/*[clinic end generated code: output=880b29f01dd014c8 input=898361fb71f59294]*/
13037
9.46k
{
13038
9.46k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
13039
9.46k
    if (result < 0) {
13040
6.44k
        return -1;
13041
6.44k
    }
13042
3.01k
    return result;
13043
9.46k
}
13044
13045
/*[clinic input]
13046
str.rindex as unicode_rindex = str.count
13047
13048
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
13049
13050
Optional arguments start and end are interpreted as in slice notation.
13051
Raises ValueError when the substring is not found.
13052
[clinic start generated code]*/
13053
13054
static Py_ssize_t
13055
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
13056
                    Py_ssize_t end)
13057
/*[clinic end generated code: output=5f3aef124c867fe1 input=35943dead6c1ea9d]*/
13058
186k
{
13059
186k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
13060
186k
    if (result == -1) {
13061
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
13062
0
    }
13063
186k
    else if (result < 0) {
13064
0
        return -1;
13065
0
    }
13066
186k
    return result;
13067
186k
}
13068
13069
/*[clinic input]
13070
str.rjust as unicode_rjust
13071
13072
    width: Py_ssize_t
13073
    fillchar: Py_UCS4 = ' '
13074
    /
13075
13076
Return a right-justified string of length width.
13077
13078
Padding is done using the specified fill character (default is a space).
13079
[clinic start generated code]*/
13080
13081
static PyObject *
13082
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13083
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13084
0
{
13085
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13086
0
        return unicode_result_unchanged(self);
13087
13088
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13089
0
}
13090
13091
PyObject *
13092
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13093
0
{
13094
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13095
0
        return NULL;
13096
13097
0
    return split(s, sep, maxsplit);
13098
0
}
13099
13100
/*[clinic input]
13101
str.split as unicode_split
13102
13103
    sep: object = None
13104
        The separator used to split the string.
13105
13106
        When set to None (the default value), will split on any whitespace
13107
        character (including \n \r \t \f and spaces) and will discard
13108
        empty strings from the result.
13109
    maxsplit: Py_ssize_t = -1
13110
        Maximum number of splits.
13111
        -1 (the default value) means no limit.
13112
13113
Return a list of the substrings in the string, using sep as the separator string.
13114
13115
Splitting starts at the front of the string and works to the end.
13116
13117
Note, str.split() is mainly useful for data that has been intentionally
13118
delimited.  With natural text that includes punctuation, consider using
13119
the regular expression module.
13120
13121
[clinic start generated code]*/
13122
13123
static PyObject *
13124
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13125
/*[clinic end generated code: output=3a65b1db356948dc input=a29bcc0c7a5af0eb]*/
13126
19.8M
{
13127
19.8M
    if (sep == Py_None)
13128
160k
        return split(self, NULL, maxsplit);
13129
19.6M
    if (PyUnicode_Check(sep))
13130
19.6M
        return split(self, sep, maxsplit);
13131
13132
0
    PyErr_Format(PyExc_TypeError,
13133
0
                 "must be str or None, not %.100s",
13134
0
                 Py_TYPE(sep)->tp_name);
13135
0
    return NULL;
13136
19.6M
}
13137
13138
/* Partition str_obj around sep_obj, delegating the search to the
   asciilib/ucs1lib/ucs2lib/ucs4lib partition routine that matches the
   kind of str_obj.

   Both arguments go through ensure_unicode() first.  If the separator has
   a wider kind or is longer than the string, the search is skipped and
   the tuple (str_obj, "", "") is returned directly.  When the kinds
   differ, the separator data is converted to the string's kind with
   unicode_askind(); that temporary buffer is released with PyMem_Free()
   before returning.

   Returns NULL if an ensure_unicode() check or unicode_askind() fails;
   otherwise returns whatever PyTuple_Pack() or the selected partition
   routine produced. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0) {
        return NULL;
    }
    if (ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    int str_kind = PyUnicode_KIND(str_obj);
    int sep_kind = PyUnicode_KIND(sep_obj);
    Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);
    int converted = (sep_kind != str_kind);
    if (converted) {
        /* Bring the separator to the same kind as the string. */
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_partition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_partition(str_obj, str_buf, str_len,
                                       sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    /* sep_buf points into sep_obj exactly when no conversion happened. */
    assert((!converted) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
13188
13189
13190
/* Partition str_obj around sep_obj, delegating the search to the
   asciilib/ucs1lib/ucs2lib/ucs4lib rpartition routine that matches the
   kind of str_obj.

   Both arguments go through ensure_unicode() first.  If the separator has
   a wider kind or is longer than the string, the search is skipped and
   the tuple ("", "", str_obj) is returned directly.  When the kinds
   differ, the separator data is converted to the string's kind with
   unicode_askind(); that temporary buffer is released with PyMem_Free()
   before returning.

   Returns NULL if an ensure_unicode() check or unicode_askind() fails;
   otherwise returns whatever PyTuple_Pack() or the selected rpartition
   routine produced. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0) {
        return NULL;
    }
    if (ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    int str_kind = PyUnicode_KIND(str_obj);
    int sep_kind = PyUnicode_KIND(sep_obj);
    Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);
    int converted = (sep_kind != str_kind);
    if (converted) {
        /* Bring the separator to the same kind as the string. */
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_rpartition(str_obj, str_buf, str_len,
                                         sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_rpartition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    /* sep_buf points into sep_obj exactly when no conversion happened. */
    assert((!converted) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
13240
13241
/*[clinic input]
13242
str.partition as unicode_partition
13243
13244
    sep: object
13245
    /
13246
13247
Partition the string into three parts using the given separator.
13248
13249
This will search for the separator in the string.  If the separator is found,
13250
returns a 3-tuple containing the part before the separator, the separator
13251
itself, and the part after it.
13252
13253
If the separator is not found, returns a 3-tuple containing the original string
13254
and two empty strings.
13255
[clinic start generated code]*/
13256
13257
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
{
    /* str.partition() is a direct forward to the C API function, which
       also performs the argument checks. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
13263
13264
/*[clinic input]
13265
str.rpartition as unicode_rpartition = str.partition
13266
13267
Partition the string into three parts using the given separator.
13268
13269
This will search for the separator in the string, starting at the end. If
13270
the separator is found, returns a 3-tuple containing the part before the
13271
separator, the separator itself, and the part after it.
13272
13273
If the separator is not found, returns a 3-tuple containing two empty strings
13274
and the original string.
13275
[clinic start generated code]*/
13276
13277
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
{
    /* str.rpartition() is a direct forward to the C API function, which
       also performs the argument checks. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13283
13284
PyObject *
13285
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13286
0
{
13287
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13288
0
        return NULL;
13289
13290
0
    return rsplit(s, sep, maxsplit);
13291
0
}
13292
13293
/*[clinic input]
13294
str.rsplit as unicode_rsplit = str.split
13295
13296
Return a list of the substrings in the string, using sep as the separator string.
13297
13298
Splitting starts at the end of the string and works to the front.
13299
[clinic start generated code]*/
13300
13301
static PyObject *
13302
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13303
/*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
13304
50
{
13305
50
    if (sep == Py_None)
13306
0
        return rsplit(self, NULL, maxsplit);
13307
50
    if (PyUnicode_Check(sep))
13308
50
        return rsplit(self, sep, maxsplit);
13309
13310
0
    PyErr_Format(PyExc_TypeError,
13311
0
                 "must be str or None, not %.100s",
13312
0
                 Py_TYPE(sep)->tp_name);
13313
0
    return NULL;
13314
50
}
13315
13316
/*[clinic input]
13317
str.splitlines as unicode_splitlines
13318
13319
    keepends: bool = False
13320
13321
Return a list of the lines in the string, breaking at line boundaries.
13322
13323
Line breaks are not included in the resulting list unless keepends is given and
13324
true.
13325
[clinic start generated code]*/
13326
13327
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=ba6ad05ee85d2b55]*/
{
    /* str.splitlines() is a direct forward to the C API function. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13333
13334
static
13335
PyObject *unicode_str(PyObject *self)
13336
3.01M
{
13337
3.01M
    return unicode_result_unchanged(self);
13338
3.01M
}
13339
13340
/*[clinic input]
13341
str.swapcase as unicode_swapcase
13342
13343
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13344
[clinic start generated code]*/
13345
13346
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
{
    /* The generic case_operation() driver does the work, with
       do_swapcase as the per-string transformation. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13352
13353
/*[clinic input]
13354
13355
@staticmethod
13356
str.maketrans as unicode_maketrans
13357
13358
  x: object
13359
13360
  y: unicode=NULL
13361
13362
  z: unicode=NULL
13363
13364
  /
13365
13366
Return a translation table usable for str.translate().
13367
13368
If there is only one argument, it must be a dictionary mapping Unicode
13369
ordinals (integers) or characters to Unicode ordinals, strings or None.
13370
Character keys will be then converted to ordinals.
13371
If there are two arguments, they must be strings of equal length, and
13372
in the resulting dictionary, each character in x will be mapped to the
13373
character at the same position in y. If there is a third argument, it
13374
must be a string, whose characters will be mapped to None in the result.
13375
[clinic start generated code]*/
13376
13377
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* Build a fresh dict keyed by integer ordinals.

       - y == NULL: x must be an exact dict.  Its entries are copied;
         one-character string keys are replaced by their ordinal and
         integer keys are kept as they are.
       - y != NULL: x must be a str of the same length as y.  Each
         character of x is mapped to the character at the same index in y
         (both stored as ints).  If z is given, each of its characters is
         mapped to None.

       On any failure the partially built dict is released and NULL is
       returned with an exception set. */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* x must be a dict */
        if (!PyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto fail;
        }

        /* copy entries into the new dict, converting string keys to int keys */
        Py_ssize_t pos = 0;
        PyObject *key, *value;
        while (PyDict_Next(x, &pos, &key, &value)) {
            if (PyUnicode_Check(key)) {
                /* convert string keys to integer keys */
                if (PyUnicode_GET_LENGTH(key) != 1) {
                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
                                    "table must be of length 1");
                    goto fail;
                }
                int key_kind = PyUnicode_KIND(key);
                const void *key_data = PyUnicode_DATA(key);
                PyObject *ordinal =
                    PyLong_FromLong(PyUnicode_READ(key_kind, key_data, 0));
                if (ordinal == NULL) {
                    goto fail;
                }
                int rc = PyDict_SetItem(table, ordinal, value);
                Py_DECREF(ordinal);
                if (rc < 0) {
                    goto fail;
                }
            }
            else if (PyLong_Check(key)) {
                /* just keep integer keys */
                if (PyDict_SetItem(table, key, value) < 0) {
                    goto fail;
                }
            }
            else {
                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
                                "be strings or integers");
                goto fail;
            }
        }
    }
    else {
        /* x must be a string too, of equal length */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto fail;
        }
        Py_ssize_t x_len = PyUnicode_GET_LENGTH(x);
        if (x_len != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto fail;
        }

        /* create entries for translating chars in x to those in y */
        int x_kind = PyUnicode_KIND(x);
        int y_kind = PyUnicode_KIND(y);
        const void *x_data = PyUnicode_DATA(x);
        const void *y_data = PyUnicode_DATA(y);
        for (Py_ssize_t idx = 0; idx < x_len; idx++) {
            PyObject *from_ord =
                PyLong_FromLong(PyUnicode_READ(x_kind, x_data, idx));
            if (from_ord == NULL) {
                goto fail;
            }
            PyObject *to_ord =
                PyLong_FromLong(PyUnicode_READ(y_kind, y_data, idx));
            if (to_ord == NULL) {
                Py_DECREF(from_ord);
                goto fail;
            }
            int rc = PyDict_SetItem(table, from_ord, to_ord);
            Py_DECREF(from_ord);
            Py_DECREF(to_ord);
            if (rc < 0) {
                goto fail;
            }
        }

        /* create entries for deleting chars in z */
        if (z != NULL) {
            int z_kind = PyUnicode_KIND(z);
            const void *z_data = PyUnicode_DATA(z);
            Py_ssize_t z_len = PyUnicode_GET_LENGTH(z);
            for (Py_ssize_t idx = 0; idx < z_len; idx++) {
                PyObject *del_ord =
                    PyLong_FromLong(PyUnicode_READ(z_kind, z_data, idx));
                if (del_ord == NULL) {
                    goto fail;
                }
                int rc = PyDict_SetItem(table, del_ord, Py_None);
                Py_DECREF(del_ord);
                if (rc < 0) {
                    goto fail;
                }
            }
        }
    }
    return table;

  fail:
    Py_DECREF(table);
    return NULL;
}
13482
13483
/*[clinic input]
13484
str.translate as unicode_translate
13485
13486
    table: object
13487
        Translation table, which must be a mapping of Unicode ordinals to
13488
        Unicode ordinals, strings, or None.
13489
    /
13490
13491
Replace each character in the string using the given translation table.
13492
13493
The table must implement lookup/indexing via __getitem__, for instance a
13494
dictionary or list.  If this operation raises LookupError, the character is
13495
left untouched.  Characters mapped to None are deleted.
13496
[clinic start generated code]*/
13497
13498
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
{
    /* str.translate() always passes the "ignore" error handler name to
       the charmap translator. */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13504
13505
/*[clinic input]
13506
str.upper as unicode_upper
13507
13508
Return a copy of the string converted to uppercase.
13509
[clinic start generated code]*/
13510
13511
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* Non-ASCII strings go through the generic case_operation() driver;
       ASCII-only strings take the dedicated helper (second argument 0). */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    return ascii_upper_or_lower(self, 0);
}
13519
13520
/*[clinic input]
13521
str.zfill as unicode_zfill
13522
13523
    width: Py_ssize_t
13524
    /
13525
13526
Pad a numeric string with zeros on the left, to fill a field of the given width.
13527
13528
The string is never truncated.
13529
[clinic start generated code]*/
13530
13531
static PyObject *
13532
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13533
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13534
0
{
13535
0
    Py_ssize_t fill;
13536
0
    PyObject *u;
13537
0
    int kind;
13538
0
    const void *data;
13539
0
    Py_UCS4 chr;
13540
13541
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13542
0
        return unicode_result_unchanged(self);
13543
13544
0
    fill = width - PyUnicode_GET_LENGTH(self);
13545
13546
0
    u = pad(self, fill, 0, '0');
13547
13548
0
    if (u == NULL)
13549
0
        return NULL;
13550
13551
0
    kind = PyUnicode_KIND(u);
13552
0
    data = PyUnicode_DATA(u);
13553
0
    chr = PyUnicode_READ(kind, data, fill);
13554
13555
0
    if (chr == '+' || chr == '-') {
13556
        /* move sign to beginning of string */
13557
0
        PyUnicode_WRITE(kind, data, 0, chr);
13558
0
        PyUnicode_WRITE(kind, data, fill, '0');
13559
0
    }
13560
13561
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13562
0
    return u;
13563
0
}
13564
13565
/*[clinic input]
13566
@text_signature "($self, prefix[, start[, end]], /)"
13567
str.startswith as unicode_startswith
13568
13569
    prefix as subobj: object
13570
        A string or a tuple of strings to try.
13571
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13572
        Optional start position. Default: start of the string.
13573
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13574
        Optional stop position. Default: end of the string.
13575
    /
13576
13577
Return True if the string starts with the specified prefix, False otherwise.
13578
[clinic start generated code]*/
13579
13580
static PyObject *
13581
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13582
                        Py_ssize_t end)
13583
/*[clinic end generated code: output=4bd7cfd0803051d4 input=5f918b5f5f89d856]*/
13584
65.5M
{
13585
65.5M
    if (PyTuple_Check(subobj)) {
13586
8.61M
        Py_ssize_t i;
13587
31.3M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13588
22.8M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13589
22.8M
            if (!PyUnicode_Check(substring)) {
13590
0
                PyErr_Format(PyExc_TypeError,
13591
0
                             "tuple for startswith must only contain str, "
13592
0
                             "not %.100s",
13593
0
                             Py_TYPE(substring)->tp_name);
13594
0
                return NULL;
13595
0
            }
13596
22.8M
            int result = tailmatch(self, substring, start, end, -1);
13597
22.8M
            if (result < 0) {
13598
0
                return NULL;
13599
0
            }
13600
22.8M
            if (result) {
13601
43.1k
                Py_RETURN_TRUE;
13602
43.1k
            }
13603
22.8M
        }
13604
        /* nothing matched */
13605
8.61M
        Py_RETURN_FALSE;
13606
8.61M
    }
13607
56.8M
    if (!PyUnicode_Check(subobj)) {
13608
0
        PyErr_Format(PyExc_TypeError,
13609
0
                     "startswith first arg must be str or "
13610
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13611
0
        return NULL;
13612
0
    }
13613
56.8M
    int result = tailmatch(self, subobj, start, end, -1);
13614
56.8M
    if (result < 0) {
13615
0
        return NULL;
13616
0
    }
13617
56.8M
    return PyBool_FromLong(result);
13618
56.8M
}
13619
13620
13621
/*[clinic input]
13622
@text_signature "($self, suffix[, start[, end]], /)"
13623
str.endswith as unicode_endswith
13624
13625
    suffix as subobj: object
13626
        A string or a tuple of strings to try.
13627
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13628
        Optional start position. Default: start of the string.
13629
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13630
        Optional stop position. Default: end of the string.
13631
    /
13632
13633
Return True if the string ends with the specified suffix, False otherwise.
13634
[clinic start generated code]*/
13635
13636
static PyObject *
13637
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13638
                      Py_ssize_t end)
13639
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=00fbdc774a7d4d71]*/
13640
12.5M
{
13641
12.5M
    if (PyTuple_Check(subobj)) {
13642
182k
        Py_ssize_t i;
13643
342k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13644
321k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13645
321k
            if (!PyUnicode_Check(substring)) {
13646
0
                PyErr_Format(PyExc_TypeError,
13647
0
                             "tuple for endswith must only contain str, "
13648
0
                             "not %.100s",
13649
0
                             Py_TYPE(substring)->tp_name);
13650
0
                return NULL;
13651
0
            }
13652
321k
            int result = tailmatch(self, substring, start, end, +1);
13653
321k
            if (result < 0) {
13654
0
                return NULL;
13655
0
            }
13656
321k
            if (result) {
13657
160k
                Py_RETURN_TRUE;
13658
160k
            }
13659
321k
        }
13660
182k
        Py_RETURN_FALSE;
13661
182k
    }
13662
12.3M
    if (!PyUnicode_Check(subobj)) {
13663
0
        PyErr_Format(PyExc_TypeError,
13664
0
                     "endswith first arg must be str or "
13665
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13666
0
        return NULL;
13667
0
    }
13668
12.3M
    int result = tailmatch(self, subobj, start, end, +1);
13669
12.3M
    if (result < 0) {
13670
0
        return NULL;
13671
0
    }
13672
12.3M
    return PyBool_FromLong(result);
13673
12.3M
}
13674
13675
13676
static inline void
13677
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13678
59.9M
{
13679
59.9M
    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13680
59.9M
    writer->data = PyUnicode_DATA(writer->buffer);
13681
13682
59.9M
    if (!writer->readonly) {
13683
59.9M
        writer->kind = PyUnicode_KIND(writer->buffer);
13684
59.9M
        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13685
59.9M
    }
13686
13.8k
    else {
13687
        /* use a value smaller than PyUnicode_1BYTE_KIND() so
13688
           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13689
13.8k
        writer->kind = 0;
13690
13.8k
        assert(writer->kind <= PyUnicode_1BYTE_KIND);
13691
13692
        /* Copy-on-write mode: set buffer size to 0 so
13693
         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13694
         * next write. */
13695
13.8k
        writer->size = 0;
13696
13.8k
    }
13697
59.9M
}
13698
13699
13700
/* Reset a writer to its initial state: every field is zeroed, then
   min_char is set to 127. */
void
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
{
    memset(writer, 0, sizeof(_PyUnicodeWriter));

    /* ASCII is the bare minimum */
    writer->min_char = 127;

    /* use a kind value smaller than PyUnicode_1BYTE_KIND so
       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
    assert(writer->kind == 0);
    assert(writer->kind < PyUnicode_1BYTE_KIND);
}
13713
13714
13715
PyUnicodeWriter*
13716
PyUnicodeWriter_Create(Py_ssize_t length)
13717
4.21M
{
13718
4.21M
    if (length < 0) {
13719
0
        PyErr_SetString(PyExc_ValueError,
13720
0
                        "length must be positive");
13721
0
        return NULL;
13722
0
    }
13723
13724
4.21M
    const size_t size = sizeof(_PyUnicodeWriter);
13725
4.21M
    PyUnicodeWriter *pub_writer;
13726
4.21M
    pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
13727
4.21M
    if (pub_writer == NULL) {
13728
2.34M
        pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
13729
2.34M
        if (pub_writer == NULL) {
13730
0
            return (PyUnicodeWriter *)PyErr_NoMemory();
13731
0
        }
13732
2.34M
    }
13733
4.21M
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
13734
13735
4.21M
    _PyUnicodeWriter_Init(writer);
13736
4.21M
    if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
13737
0
        PyUnicodeWriter_Discard(pub_writer);
13738
0
        return NULL;
13739
0
    }
13740
4.21M
    writer->overallocate = 1;
13741
13742
4.21M
    return pub_writer;
13743
4.21M
}
13744
13745
13746
/* Drop a writer without producing a string.  A NULL writer is accepted
   and ignored.  Otherwise the buffer is released through
   _PyUnicodeWriter_Dealloc() and the structure is handed to
   _Py_FREELIST_FREE() with PyMem_Free as the deallocator. */
void PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
{
    if (writer != NULL) {
        _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
        _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    }
}
13754
13755
13756
// Initialize _PyUnicodeWriter with initial buffer
13757
static inline void
13758
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13759
467k
{
13760
467k
    memset(writer, 0, sizeof(*writer));
13761
467k
    writer->buffer = buffer;
13762
467k
    _PyUnicodeWriter_Update(writer);
13763
467k
    writer->min_length = writer->size;
13764
467k
}
13765
13766
13767
int
13768
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13769
                                 Py_ssize_t length, Py_UCS4 maxchar)
13770
59.4M
{
13771
59.4M
    Py_ssize_t newlen;
13772
59.4M
    PyObject *newbuffer;
13773
13774
59.4M
    assert(length >= 0);
13775
59.4M
    assert(maxchar <= MAX_UNICODE);
13776
13777
    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13778
59.4M
    assert((maxchar > writer->maxchar && length >= 0)
13779
59.4M
           || length > 0);
13780
13781
59.4M
    if (length > PY_SSIZE_T_MAX - writer->pos) {
13782
0
        PyErr_NoMemory();
13783
0
        return -1;
13784
0
    }
13785
59.4M
    newlen = writer->pos + length;
13786
13787
59.4M
    maxchar = Py_MAX(maxchar, writer->min_char);
13788
13789
59.4M
    if (writer->buffer == NULL) {
13790
43.4M
        assert(!writer->readonly);
13791
43.4M
        if (writer->overallocate
13792
43.4M
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13793
            /* overallocate to limit the number of realloc() */
13794
32.8M
            newlen += newlen / OVERALLOCATE_FACTOR;
13795
32.8M
        }
13796
43.4M
        if (newlen < writer->min_length)
13797
39.1M
            newlen = writer->min_length;
13798
13799
43.4M
        writer->buffer = PyUnicode_New(newlen, maxchar);
13800
43.4M
        if (writer->buffer == NULL)
13801
0
            return -1;
13802
43.4M
    }
13803
16.0M
    else if (newlen > writer->size) {
13804
13.5M
        if (writer->overallocate
13805
13.5M
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13806
            /* overallocate to limit the number of realloc() */
13807
13.2M
            newlen += newlen / OVERALLOCATE_FACTOR;
13808
13.2M
        }
13809
13.5M
        if (newlen < writer->min_length)
13810
952
            newlen = writer->min_length;
13811
13812
13.5M
        if (maxchar > writer->maxchar || writer->readonly) {
13813
            /* resize + widen */
13814
3.16M
            maxchar = Py_MAX(maxchar, writer->maxchar);
13815
3.16M
            newbuffer = PyUnicode_New(newlen, maxchar);
13816
3.16M
            if (newbuffer == NULL)
13817
0
                return -1;
13818
3.16M
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13819
3.16M
                                          writer->buffer, 0, writer->pos);
13820
3.16M
            Py_DECREF(writer->buffer);
13821
3.16M
            writer->readonly = 0;
13822
3.16M
        }
13823
10.4M
        else {
13824
10.4M
            newbuffer = resize_compact(writer->buffer, newlen);
13825
10.4M
            if (newbuffer == NULL)
13826
0
                return -1;
13827
10.4M
        }
13828
13.5M
        writer->buffer = newbuffer;
13829
13.5M
    }
13830
2.43M
    else if (maxchar > writer->maxchar) {
13831
2.43M
        assert(!writer->readonly);
13832
2.43M
        newbuffer = PyUnicode_New(writer->size, maxchar);
13833
2.43M
        if (newbuffer == NULL)
13834
0
            return -1;
13835
2.43M
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13836
2.43M
                                      writer->buffer, 0, writer->pos);
13837
2.43M
        Py_SETREF(writer->buffer, newbuffer);
13838
2.43M
    }
13839
59.4M
    _PyUnicodeWriter_Update(writer);
13840
59.4M
    return 0;
13841
13842
59.4M
#undef OVERALLOCATE_FACTOR
13843
59.4M
}
13844
13845
int
13846
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13847
                                     int kind)
13848
133k
{
13849
133k
    Py_UCS4 maxchar;
13850
13851
    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13852
133k
    assert(writer->kind < kind);
13853
13854
133k
    switch (kind)
13855
133k
    {
13856
0
    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13857
133k
    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13858
0
    case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13859
0
    default:
13860
0
        Py_UNREACHABLE();
13861
133k
    }
13862
13863
133k
    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13864
133k
}
13865
13866
static inline int
13867
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13868
253M
{
13869
253M
    assert(ch <= MAX_UNICODE);
13870
253M
    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13871
0
        return -1;
13872
253M
    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13873
253M
    writer->pos++;
13874
253M
    return 0;
13875
253M
}
13876
13877
int
13878
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13879
84.9M
{
13880
84.9M
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13881
84.9M
}
13882
13883
int
13884
PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
13885
65.0M
{
13886
65.0M
    if (ch > MAX_UNICODE) {
13887
0
        PyErr_SetString(PyExc_ValueError,
13888
0
                        "character must be in range(0x110000)");
13889
0
        return -1;
13890
0
    }
13891
13892
65.0M
    return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
13893
65.0M
}
13894
13895
int
13896
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13897
60.9M
{
13898
60.9M
    assert(PyUnicode_Check(str));
13899
13900
60.9M
    Py_UCS4 maxchar;
13901
60.9M
    Py_ssize_t len;
13902
13903
60.9M
    len = PyUnicode_GET_LENGTH(str);
13904
60.9M
    if (len == 0)
13905
21.7M
        return 0;
13906
39.2M
    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13907
39.2M
    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13908
21.4M
        if (writer->buffer == NULL && !writer->overallocate) {
13909
5.31k
            assert(_PyUnicode_CheckConsistency(str, 1));
13910
5.31k
            writer->readonly = 1;
13911
5.31k
            writer->buffer = Py_NewRef(str);
13912
5.31k
            _PyUnicodeWriter_Update(writer);
13913
5.31k
            writer->pos += len;
13914
5.31k
            return 0;
13915
5.31k
        }
13916
21.4M
        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13917
0
            return -1;
13918
21.4M
    }
13919
39.2M
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13920
39.2M
                                  str, 0, len);
13921
39.2M
    writer->pos += len;
13922
39.2M
    return 0;
13923
39.2M
}
13924
13925
int
13926
PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
13927
4.00M
{
13928
4.00M
    PyTypeObject *type = Py_TYPE(obj);
13929
4.00M
    if (type == &PyUnicode_Type) {
13930
4.00M
        return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj);
13931
4.00M
    }
13932
13933
0
    if (type == &PyLong_Type) {
13934
0
        return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
13935
0
    }
13936
13937
0
    PyObject *str = PyObject_Str(obj);
13938
0
    if (str == NULL) {
13939
0
        return -1;
13940
0
    }
13941
13942
0
    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
13943
0
    Py_DECREF(str);
13944
0
    return res;
13945
0
}
13946
13947
13948
int
13949
PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
13950
8.23M
{
13951
8.23M
    if (Py_TYPE(obj) == &PyLong_Type) {
13952
1.32M
        return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
13953
1.32M
    }
13954
13955
6.90M
    PyObject *repr = PyObject_Repr(obj);
13956
6.90M
    if (repr == NULL) {
13957
0
        return -1;
13958
0
    }
13959
13960
6.90M
    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr);
13961
6.90M
    Py_DECREF(repr);
13962
6.90M
    return res;
13963
6.90M
}
13964
13965
13966
int
13967
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13968
                                Py_ssize_t start, Py_ssize_t end)
13969
62.3M
{
13970
62.3M
    assert(0 <= start);
13971
62.3M
    assert(end <= PyUnicode_GET_LENGTH(str));
13972
62.3M
    assert(start <= end);
13973
13974
62.3M
    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13975
98
        return _PyUnicodeWriter_WriteStr(writer, str);
13976
13977
62.3M
    Py_ssize_t len = end - start;
13978
62.3M
    if (len == 0) {
13979
0
        return 0;
13980
0
    }
13981
13982
62.3M
    Py_UCS4 maxchar;
13983
62.3M
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) {
13984
12.7M
        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13985
12.7M
    }
13986
49.6M
    else {
13987
49.6M
        maxchar = writer->maxchar;
13988
49.6M
    }
13989
62.3M
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) {
13990
0
        return -1;
13991
0
    }
13992
13993
62.3M
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13994
62.3M
                                  str, start, len);
13995
62.3M
    writer->pos += len;
13996
62.3M
    return 0;
13997
62.3M
}
13998
13999
14000
int
14001
PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
14002
                               Py_ssize_t start, Py_ssize_t end)
14003
540k
{
14004
540k
    if (!PyUnicode_Check(str)) {
14005
0
        PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
14006
0
        return -1;
14007
0
    }
14008
540k
    if (start < 0 || start > end) {
14009
0
        PyErr_Format(PyExc_ValueError, "invalid start argument");
14010
0
        return -1;
14011
0
    }
14012
540k
    if (end > PyUnicode_GET_LENGTH(str)) {
14013
0
        PyErr_Format(PyExc_ValueError, "invalid end argument");
14014
0
        return -1;
14015
0
    }
14016
14017
540k
    return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
14018
540k
                                           start, end);
14019
540k
}
14020
14021
14022
int
14023
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14024
                                  const char *ascii, Py_ssize_t len)
14025
43.6M
{
14026
43.6M
    if (len == -1)
14027
0
        len = strlen(ascii);
14028
14029
43.6M
    assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
14030
14031
43.6M
    if (writer->buffer == NULL && !writer->overallocate) {
14032
8.57k
        PyObject *str;
14033
14034
8.57k
        str = _PyUnicode_FromASCII(ascii, len);
14035
8.57k
        if (str == NULL)
14036
0
            return -1;
14037
14038
8.57k
        writer->readonly = 1;
14039
8.57k
        writer->buffer = str;
14040
8.57k
        _PyUnicodeWriter_Update(writer);
14041
8.57k
        writer->pos += len;
14042
8.57k
        return 0;
14043
8.57k
    }
14044
14045
43.6M
    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14046
0
        return -1;
14047
14048
43.6M
    switch (writer->kind)
14049
43.6M
    {
14050
43.6M
    case PyUnicode_1BYTE_KIND:
14051
43.6M
    {
14052
43.6M
        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14053
43.6M
        Py_UCS1 *data = writer->data;
14054
14055
43.6M
        memcpy(data + writer->pos, str, len);
14056
43.6M
        break;
14057
0
    }
14058
9.26k
    case PyUnicode_2BYTE_KIND:
14059
9.26k
    {
14060
9.26k
        _PyUnicode_CONVERT_BYTES(
14061
9.26k
            Py_UCS1, Py_UCS2,
14062
9.26k
            ascii, ascii + len,
14063
9.26k
            (Py_UCS2 *)writer->data + writer->pos);
14064
9.26k
        break;
14065
0
    }
14066
4.19k
    case PyUnicode_4BYTE_KIND:
14067
4.19k
    {
14068
4.19k
        _PyUnicode_CONVERT_BYTES(
14069
4.19k
            Py_UCS1, Py_UCS4,
14070
4.19k
            ascii, ascii + len,
14071
4.19k
            (Py_UCS4 *)writer->data + writer->pos);
14072
4.19k
        break;
14073
0
    }
14074
0
    default:
14075
0
        Py_UNREACHABLE();
14076
43.6M
    }
14077
14078
43.6M
    writer->pos += len;
14079
43.6M
    return 0;
14080
43.6M
}
14081
14082
14083
int
14084
PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
14085
                           const char *str,
14086
                           Py_ssize_t size)
14087
438k
{
14088
438k
    assert(writer != NULL);
14089
438k
    _Py_AssertHoldsTstate();
14090
14091
438k
    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
14092
438k
    return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size);
14093
438k
}
14094
14095
14096
int
14097
PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
14098
                          const char *str,
14099
                          Py_ssize_t size)
14100
0
{
14101
0
    if (size < 0) {
14102
0
        size = strlen(str);
14103
0
    }
14104
14105
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
14106
0
    Py_ssize_t old_pos = _writer->pos;
14107
0
    int res = unicode_decode_utf8_writer(_writer, str, size,
14108
0
                                         _Py_ERROR_STRICT, NULL, NULL);
14109
0
    if (res < 0) {
14110
0
        _writer->pos = old_pos;
14111
0
    }
14112
0
    return res;
14113
0
}
14114
14115
14116
int
14117
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
14118
                                   const char *string,
14119
                                   Py_ssize_t length,
14120
                                   const char *errors,
14121
                                   Py_ssize_t *consumed)
14122
0
{
14123
0
    if (length < 0) {
14124
0
        length = strlen(string);
14125
0
    }
14126
14127
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
14128
0
    Py_ssize_t old_pos = _writer->pos;
14129
0
    int res = unicode_decode_utf8_writer(_writer, string, length,
14130
0
                                         _Py_ERROR_UNKNOWN, errors, consumed);
14131
0
    if (res < 0) {
14132
0
        _writer->pos = old_pos;
14133
0
        if (consumed) {
14134
0
            *consumed = 0;
14135
0
        }
14136
0
    }
14137
0
    return res;
14138
0
}
14139
14140
14141
int
14142
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14143
                                   const char *str, Py_ssize_t len)
14144
0
{
14145
0
    Py_UCS4 maxchar;
14146
14147
0
    maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14148
0
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14149
0
        return -1;
14150
0
    unicode_write_cstr(writer->buffer, writer->pos, str, len);
14151
0
    writer->pos += len;
14152
0
    return 0;
14153
0
}
14154
14155
PyObject *
14156
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14157
43.8M
{
14158
43.8M
    PyObject *str;
14159
14160
43.8M
    if (writer->pos == 0) {
14161
794
        Py_CLEAR(writer->buffer);
14162
794
        _Py_RETURN_UNICODE_EMPTY();
14163
794
    }
14164
14165
43.8M
    str = writer->buffer;
14166
43.8M
    writer->buffer = NULL;
14167
14168
43.8M
    if (writer->readonly) {
14169
12.9k
        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14170
12.9k
        return str;
14171
12.9k
    }
14172
14173
43.8M
    if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14174
43.0M
        PyObject *str2;
14175
43.0M
        str2 = resize_compact(str, writer->pos);
14176
43.0M
        if (str2 == NULL) {
14177
0
            Py_DECREF(str);
14178
0
            return NULL;
14179
0
        }
14180
43.0M
        str = str2;
14181
43.0M
    }
14182
14183
43.8M
    assert(_PyUnicode_CheckConsistency(str, 1));
14184
43.8M
    return unicode_result(str);
14185
43.8M
}
14186
14187
14188
PyObject*
14189
PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
14190
4.21M
{
14191
4.21M
    PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer);
14192
4.21M
    assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
14193
4.21M
    _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
14194
4.21M
    return str;
14195
4.21M
}
14196
14197
14198
void
14199
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14200
4.62M
{
14201
4.62M
    Py_CLEAR(writer->buffer);
14202
4.62M
}
14203
14204
#include "stringlib/unicode_format.h"
14205
14206
PyDoc_STRVAR(format__doc__,
14207
             "format($self, /, *args, **kwargs)\n\
14208
--\n\
14209
\n\
14210
Return a formatted version of the string, using substitutions from args and kwargs.\n\
14211
The substitutions are identified by braces ('{' and '}').");
14212
14213
PyDoc_STRVAR(format_map__doc__,
14214
             "format_map($self, mapping, /)\n\
14215
--\n\
14216
\n\
14217
Return a formatted version of the string, using substitutions from mapping.\n\
14218
The substitutions are identified by braces ('{' and '}').");
14219
14220
/*[clinic input]
14221
str.__format__ as unicode___format__
14222
14223
    format_spec: unicode
14224
    /
14225
14226
Return a formatted version of the string as described by format_spec.
14227
[clinic start generated code]*/
14228
14229
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    /* Format `self` according to the whole of `format_spec` into a
       stack-allocated writer, then turn the writer into the result. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    const Py_ssize_t spec_len = PyUnicode_GET_LENGTH(format_spec);
    int status = _PyUnicode_FormatAdvancedWriter(&writer,
                                                 self, format_spec, 0,
                                                 spec_len);
    if (status != -1) {
        return _PyUnicodeWriter_Finish(&writer);
    }

    /* Formatting failed: drop whatever was written so far. */
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
14246
14247
/*[clinic input]
14248
str.__sizeof__ as unicode_sizeof
14249
14250
Return the size of the string in memory, in bytes.
14251
[clinic start generated code]*/
14252
14253
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    Py_ssize_t nbytes;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* Compact ASCII: header plus one byte per character and one more
           for the extra trailing slot. */
        nbytes = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* Compact non-ASCII: header plus (length + 1) units of the
           string's kind. */
        nbytes = sizeof(PyCompactUnicodeObject) +
            (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
    }
    else {
        /* Two-block object: the base object, plus the separate character
           block when there is one. */
        nbytes = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            nbytes += (PyUnicode_GET_LENGTH(self) + 1) *
                PyUnicode_KIND(self);
        }
    }

    /* Add the separately stored UTF-8 copy, if the object has one. */
    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        nbytes += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(nbytes);
}
14281
14282
/* __getnewargs__ implementation: return a 1-tuple holding a copy of `v`
   made by _PyUnicode_Copy(), or NULL if the copy fails.  The "N" format
   passes the copy's reference on to Py_BuildValue(). */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *duplicate = _PyUnicode_Copy(v);
    if (duplicate == NULL) {
        return NULL;
    }
    return Py_BuildValue("(N)", duplicate);
}
14290
14291
/*
14292
This function searchs the longest common leading whitespace
14293
of all lines in the [src, end).
14294
It returns the length of the common leading whitespace and sets `output` to
14295
point to the beginning of the common leading whitespace if length > 0.
14296
*/
14297
static Py_ssize_t
14298
search_longest_common_leading_whitespace(
14299
    const char *const src,
14300
    const char *const end,
14301
    const char **output)
14302
0
{
14303
    // [_start, _start + _len)
14304
    // describes the current longest common leading whitespace
14305
0
    const char *_start = NULL;
14306
0
    Py_ssize_t _len = 0;
14307
14308
0
    for (const char *iter = src; iter < end; ++iter) {
14309
0
        const char *line_start = iter;
14310
0
        const char *leading_whitespace_end = NULL;
14311
14312
        // scan the whole line
14313
0
        while (iter < end && *iter != '\n') {
14314
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
14315
                /* `iter` points to the first non-whitespace character
14316
                   in this line */
14317
0
                if (iter == line_start) {
14318
                    // some line has no indent, fast exit!
14319
0
                    return 0;
14320
0
                }
14321
0
                leading_whitespace_end = iter;
14322
0
            }
14323
0
            ++iter;
14324
0
        }
14325
14326
        // if this line has all white space, skip it
14327
0
        if (!leading_whitespace_end) {
14328
0
            continue;
14329
0
        }
14330
14331
0
        if (!_start) {
14332
            // update the first leading whitespace
14333
0
            _start = line_start;
14334
0
            _len = leading_whitespace_end - line_start;
14335
0
            assert(_len > 0);
14336
0
        }
14337
0
        else {
14338
            /* We then compare with the current longest leading whitespace.
14339
14340
               [line_start, leading_whitespace_end) is the leading
14341
               whitespace of this line,
14342
14343
               [_start, _start + _len) is the leading whitespace of the
14344
               current longest leading whitespace. */
14345
0
            Py_ssize_t new_len = 0;
14346
0
            const char *_iter = _start, *line_iter = line_start;
14347
14348
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
14349
0
                   && *_iter == *line_iter)
14350
0
            {
14351
0
                ++_iter;
14352
0
                ++line_iter;
14353
0
                ++new_len;
14354
0
            }
14355
14356
0
            _len = new_len;
14357
0
            if (_len == 0) {
14358
                // No common things now, fast exit!
14359
0
                return 0;
14360
0
            }
14361
0
        }
14362
0
    }
14363
14364
0
    assert(_len >= 0);
14365
0
    if (_len > 0) {
14366
0
        *output = _start;
14367
0
    }
14368
0
    return _len;
14369
0
}
14370
14371
/* Dedent a string.
14372
   Behaviour is expected to be an exact match of `textwrap.dedent`.
14373
   Return a new reference on success, NULL with exception set on error.
14374
   */
14375
PyObject *
14376
_PyUnicode_Dedent(PyObject *unicode)
14377
0
{
14378
0
    Py_ssize_t src_len = 0;
14379
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
14380
0
    if (!src) {
14381
0
        return NULL;
14382
0
    }
14383
0
    assert(src_len >= 0);
14384
0
    if (src_len == 0) {
14385
0
        return Py_NewRef(unicode);
14386
0
    }
14387
14388
0
    const char *const end = src + src_len;
14389
14390
    // [whitespace_start, whitespace_start + whitespace_len)
14391
    // describes the current longest common leading whitespace
14392
0
    const char *whitespace_start = NULL;
14393
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
14394
0
        src, end, &whitespace_start);
14395
14396
0
    if (whitespace_len == 0) {
14397
0
        return Py_NewRef(unicode);
14398
0
    }
14399
14400
    // now we should trigger a dedent
14401
0
    char *dest = PyMem_Malloc(src_len);
14402
0
    if (!dest) {
14403
0
        PyErr_NoMemory();
14404
0
        return NULL;
14405
0
    }
14406
0
    char *dest_iter = dest;
14407
14408
0
    for (const char *iter = src; iter < end; ++iter) {
14409
0
        const char *line_start = iter;
14410
0
        bool in_leading_space = true;
14411
14412
        // iterate over a line to find the end of a line
14413
0
        while (iter < end && *iter != '\n') {
14414
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
14415
0
                in_leading_space = false;
14416
0
            }
14417
0
            ++iter;
14418
0
        }
14419
14420
        // invariant: *iter == '\n' or iter == end
14421
0
        bool append_newline = iter < end;
14422
14423
        // if this line has all white space, write '\n' and continue
14424
0
        if (in_leading_space && append_newline) {
14425
0
            *dest_iter++ = '\n';
14426
0
            continue;
14427
0
        }
14428
14429
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
14430
            conditionally append '\n' */
14431
14432
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
14433
0
        assert(new_line_len >= 0);
14434
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
14435
14436
0
        dest_iter += new_line_len;
14437
14438
0
        if (append_newline) {
14439
0
            *dest_iter++ = '\n';
14440
0
        }
14441
0
    }
14442
14443
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
14444
0
    PyMem_Free(dest);
14445
0
    return res;
14446
0
}
14447
14448
/* Method table.  Most entries come from *_METHODDEF macros defined
   elsewhere (presumably generated by Argument Clinic -- confirm against
   the clinic header); "format", "format_map" and "__getnewargs__" are
   spelled out by hand.  The {NULL, NULL} entry terminates the table. */
static PyMethodDef unicode_methods[] = {
14449
    UNICODE_ENCODE_METHODDEF
14450
    UNICODE_REPLACE_METHODDEF
14451
    UNICODE_SPLIT_METHODDEF
14452
    UNICODE_RSPLIT_METHODDEF
14453
    UNICODE_JOIN_METHODDEF
14454
    UNICODE_CAPITALIZE_METHODDEF
14455
    UNICODE_CASEFOLD_METHODDEF
14456
    UNICODE_TITLE_METHODDEF
14457
    UNICODE_CENTER_METHODDEF
14458
    UNICODE_COUNT_METHODDEF
14459
    UNICODE_EXPANDTABS_METHODDEF
14460
    UNICODE_FIND_METHODDEF
14461
    UNICODE_PARTITION_METHODDEF
14462
    UNICODE_INDEX_METHODDEF
14463
    UNICODE_LJUST_METHODDEF
14464
    UNICODE_LOWER_METHODDEF
14465
    UNICODE_LSTRIP_METHODDEF
14466
    UNICODE_RFIND_METHODDEF
14467
    UNICODE_RINDEX_METHODDEF
14468
    UNICODE_RJUST_METHODDEF
14469
    UNICODE_RSTRIP_METHODDEF
14470
    UNICODE_RPARTITION_METHODDEF
14471
    UNICODE_SPLITLINES_METHODDEF
14472
    UNICODE_STRIP_METHODDEF
14473
    UNICODE_SWAPCASE_METHODDEF
14474
    UNICODE_TRANSLATE_METHODDEF
14475
    UNICODE_UPPER_METHODDEF
14476
    UNICODE_STARTSWITH_METHODDEF
14477
    UNICODE_ENDSWITH_METHODDEF
14478
    UNICODE_REMOVEPREFIX_METHODDEF
14479
    UNICODE_REMOVESUFFIX_METHODDEF
14480
    UNICODE_ISASCII_METHODDEF
14481
    UNICODE_ISLOWER_METHODDEF
14482
    UNICODE_ISUPPER_METHODDEF
14483
    UNICODE_ISTITLE_METHODDEF
14484
    UNICODE_ISSPACE_METHODDEF
14485
    UNICODE_ISDECIMAL_METHODDEF
14486
    UNICODE_ISDIGIT_METHODDEF
14487
    UNICODE_ISNUMERIC_METHODDEF
14488
    UNICODE_ISALPHA_METHODDEF
14489
    UNICODE_ISALNUM_METHODDEF
14490
    UNICODE_ISIDENTIFIER_METHODDEF
14491
    UNICODE_ISPRINTABLE_METHODDEF
14492
    UNICODE_ZFILL_METHODDEF
14493
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
14494
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
14495
    UNICODE___FORMAT___METHODDEF
14496
    UNICODE_MAKETRANS_METHODDEF
14497
    UNICODE_SIZEOF_METHODDEF
14498
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
14499
    {NULL, NULL}
14500
};
14501
14502
/* nb_remainder slot: apply printf-style formatting, `v % w`, through
   PyUnicode_Format().  Returns NotImplemented when the left operand is
   not a str object. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
14509
14510
/* Number protocol slots.  Only nb_remainder is provided (unicode_mod);
   the slots listed before it are explicitly 0, and the members after it
   are omitted from the initializer, so C zero-initializes them. */
static PyNumberMethods unicode_as_number = {
14511
    0,              /*nb_add*/
14512
    0,              /*nb_subtract*/
14513
    0,              /*nb_multiply*/
14514
    unicode_mod,            /*nb_remainder*/
14515
};
14516
14517
/* Sequence protocol slots: length, concatenation, repetition, item access
   and containment are provided; the slice and item-assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
14518
    unicode_length,     /* sq_length */
14519
    PyUnicode_Concat,   /* sq_concat */
14520
    unicode_repeat,     /* sq_repeat */
14521
    unicode_getitem,    /* sq_item */
14522
    0,                  /* sq_slice */
14523
    0,                  /* sq_ass_item */
14524
    0,                  /* sq_ass_slice */
14525
    PyUnicode_Contains, /* sq_contains */
14526
};
14527
14528
/* mp_subscript slot: implement self[item].

   - `item` is an index: negative values are offset by the length once,
     then unicode_getitem() does the lookup.
   - `item` is a slice: an empty result, the unchanged string, or a
     contiguous substring are returned through dedicated fast paths; any
     other step builds a new string character by character.
   - anything else raises TypeError.

   Returns a new string object, or NULL with an exception set. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t slicelength = PySlice_AdjustIndices(
        PyUnicode_GET_LENGTH(self), &start, &stop, step);

    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        if (start == 0 && slicelength == PyUnicode_GET_LENGTH(self)) {
            /* The slice covers the whole string. */
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + slicelength);
    }

    /* General case */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* Find the largest selected character so the result gets the
       narrowest storage that fits.  The scan stops early once the value
       reaches kind_maxchar_limit() for the source kind. */
    Py_UCS4 max_char = 127;
    if (!PyUnicode_IS_ASCII(self)) {
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        size_t pos = start;

        max_char = 0;
        for (Py_ssize_t i = 0; i < slicelength; i++, pos += step) {
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
            if (ch > max_char) {
                max_char = ch;
                if (max_char >= kind_limit) {
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    /* Copy the selected characters one at a time. */
    size_t pos = start;
    for (Py_ssize_t i = 0; i < slicelength; i++, pos += step) {
        Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, i, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
14597
14598
/* Mapping protocol slots: length and subscript (indexing and slicing via
   unicode_subscript) are provided; subscript assignment is 0. */
static PyMappingMethods unicode_as_mapping = {
14599
    unicode_length,     /* mp_length */
14600
    unicode_subscript,  /* mp_subscript */
14601
    0,                  /* mp_ass_subscript */
14602
};
14603
14604
14605
/* Helpers for PyUnicode_Format() */
14606
14607
/* State carried through the PyUnicode_Format() helpers.
   `args`, `arglen` and `argidx` drive unicode_format_getnextarg(): the
   next argument is taken at `argidx`, and a negative `arglen` makes
   `args` itself the argument.  The fmt* members presumably describe the
   format string being scanned, and `writer` presumably collects the
   output -- NOTE(review): confirm against PyUnicode_Format(). */
struct unicode_formatter_t {
14608
    PyObject *args;
14609
    int args_owned;
14610
    Py_ssize_t arglen, argidx;
14611
    PyObject *dict;
14612
14613
    int fmtkind;
14614
    Py_ssize_t fmtcnt, fmtpos;
14615
    const void *fmtdata;
14616
    PyObject *fmtstr;
14617
14618
    _PyUnicodeWriter writer;
14619
};
14620
14621
/* Parsed description of a single "%..." conversion specifier. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* last character read from the format string; the
                           conversion type once parsing has finished */
    int flags;          /* bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT, F_ZERO */
    Py_ssize_t width;   /* minimum field width, -1 if not specified */
    int prec;           /* precision, -1 if not specified */
    int sign;           /* set to 1 for integer/float conversions so that the
                           output step handles the sign character and
                           zero-fill; the output step may reset it to 0 */
};
14628
14629
static PyObject *
14630
unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14631
44.8M
{
14632
44.8M
    Py_ssize_t argidx = ctx->argidx;
14633
14634
44.8M
    if (argidx < ctx->arglen) {
14635
44.8M
        ctx->argidx++;
14636
44.8M
        if (ctx->arglen < 0)
14637
16.8M
            return ctx->args;
14638
27.9M
        else
14639
27.9M
            return PyTuple_GetItem(ctx->args, argidx);
14640
44.8M
    }
14641
0
    PyErr_SetString(PyExc_TypeError,
14642
0
                    "not enough arguments for format string");
14643
0
    return NULL;
14644
44.8M
}
14645
14646
/* Returns a new reference to a PyUnicode object, or NULL on failure. */
14647
14648
/* Format a float into the writer if the writer is not NULL, or into *p_output
14649
   otherwise.
14650
14651
   Return 0 on success, raise an exception and return -1 on error. */
14652
static int
14653
formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14654
            PyObject **p_output,
14655
            _PyUnicodeWriter *writer)
14656
0
{
14657
0
    char *p;
14658
0
    double x;
14659
0
    Py_ssize_t len;
14660
0
    int prec;
14661
0
    int dtoa_flags = 0;
14662
14663
0
    x = PyFloat_AsDouble(v);
14664
0
    if (x == -1.0 && PyErr_Occurred())
14665
0
        return -1;
14666
14667
0
    prec = arg->prec;
14668
0
    if (prec < 0)
14669
0
        prec = 6;
14670
14671
0
    if (arg->flags & F_ALT)
14672
0
        dtoa_flags |= Py_DTSF_ALT;
14673
0
    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14674
0
    if (p == NULL)
14675
0
        return -1;
14676
0
    len = strlen(p);
14677
0
    if (writer) {
14678
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14679
0
            PyMem_Free(p);
14680
0
            return -1;
14681
0
        }
14682
0
    }
14683
0
    else
14684
0
        *p_output = _PyUnicode_FromASCII(p, len);
14685
0
    PyMem_Free(p);
14686
0
    return 0;
14687
0
}
14688
14689
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X
 * for Python ints (val must satisfy PyLong_Check()).
 * Return value:  a new str object, or NULL with an exception set on error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base prefix produced by PyNumber_ToBase() is kept only when
 *         alt is non-zero (o, x and X conversions); for X the prefix and
 *         the hex digits are converted to upper case.
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted
 * alt          non-zero to keep the base prefix (the F_ALT "#" flag)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign, which is placed
 * before the base prefix.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Keep "numnondigits + prec" (at most 3 non-digit characters: sign plus
       a two-character base prefix) representable as an int. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (!_PyObject_IsUniquelyReferenced(result)) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* All the bookkeeping below is done with int counters. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip the two prefix characters by advancing the start of the
           buffer; a leading '-' is re-written at the new start. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* Build the padded text in a temporary bytes object:
           non-digit characters, then the zero fill, then the digits. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        /* From here on 'result' is the bytes object and 'buf' points into
           it; it is converted back to str below. */
        Py_SETREF(result, r1);
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* Create a fresh str when 'result' is the temporary bytes object or when
       'buf' no longer points at the start of the str data (prefix was
       skipped); otherwise the str modified in place is returned. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        /* 'unicode' may be NULL on failure; NULL is then returned. */
        Py_SETREF(result, unicode);
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14831
14832
/* Format an integer or a float as an integer.
14833
 * Return 1 if the number has been formatted into the writer,
14834
 *        0 if the number has been formatted into *p_output
14835
 *       -1 and raise an exception on error */
14836
static int
14837
mainformatlong(PyObject *v,
14838
               struct unicode_format_arg_t *arg,
14839
               PyObject **p_output,
14840
               _PyUnicodeWriter *writer)
14841
11.2M
{
14842
11.2M
    PyObject *iobj, *res;
14843
11.2M
    char type = (char)arg->ch;
14844
14845
11.2M
    if (!PyNumber_Check(v))
14846
4.56M
        goto wrongtype;
14847
14848
    /* make sure number is a type of integer for o, x, and X */
14849
6.67M
    if (!PyLong_Check(v)) {
14850
0
        if (type == 'o' || type == 'x' || type == 'X') {
14851
0
            iobj = _PyNumber_Index(v);
14852
0
        }
14853
0
        else {
14854
0
            iobj = PyNumber_Long(v);
14855
0
        }
14856
0
        if (iobj == NULL ) {
14857
0
            if (PyErr_ExceptionMatches(PyExc_TypeError))
14858
0
                goto wrongtype;
14859
0
            return -1;
14860
0
        }
14861
0
        assert(PyLong_Check(iobj));
14862
0
    }
14863
6.67M
    else {
14864
6.67M
        iobj = Py_NewRef(v);
14865
6.67M
    }
14866
14867
6.67M
    if (PyLong_CheckExact(v)
14868
6.67M
        && arg->width == -1 && arg->prec == -1
14869
6.67M
        && !(arg->flags & (F_SIGN | F_BLANK))
14870
6.67M
        && type != 'X')
14871
6.67M
    {
14872
        /* Fast path */
14873
6.67M
        int alternate = arg->flags & F_ALT;
14874
6.67M
        int base;
14875
14876
6.67M
        switch(type)
14877
6.67M
        {
14878
0
            default:
14879
0
                Py_UNREACHABLE();
14880
6.67M
            case 'd':
14881
6.67M
            case 'i':
14882
6.67M
            case 'u':
14883
6.67M
                base = 10;
14884
6.67M
                break;
14885
0
            case 'o':
14886
0
                base = 8;
14887
0
                break;
14888
0
            case 'x':
14889
0
            case 'X':
14890
0
                base = 16;
14891
0
                break;
14892
6.67M
        }
14893
14894
6.67M
        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14895
0
            Py_DECREF(iobj);
14896
0
            return -1;
14897
0
        }
14898
6.67M
        Py_DECREF(iobj);
14899
6.67M
        return 1;
14900
6.67M
    }
14901
14902
1.53k
    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14903
1.53k
    Py_DECREF(iobj);
14904
1.53k
    if (res == NULL)
14905
0
        return -1;
14906
1.53k
    *p_output = res;
14907
1.53k
    return 0;
14908
14909
4.56M
wrongtype:
14910
4.56M
    switch(type)
14911
4.56M
    {
14912
0
        case 'o':
14913
0
        case 'x':
14914
0
        case 'X':
14915
0
            PyErr_Format(PyExc_TypeError,
14916
0
                    "%%%c format: an integer is required, "
14917
0
                    "not %.200s",
14918
0
                    type, Py_TYPE(v)->tp_name);
14919
0
            break;
14920
4.56M
        default:
14921
4.56M
            PyErr_Format(PyExc_TypeError,
14922
4.56M
                    "%%%c format: a real number is required, "
14923
4.56M
                    "not %.200s",
14924
4.56M
                    type, Py_TYPE(v)->tp_name);
14925
4.56M
            break;
14926
4.56M
    }
14927
4.56M
    return -1;
14928
4.56M
}
14929
14930
static Py_UCS4
14931
formatchar(PyObject *v)
14932
0
{
14933
    /* presume that the buffer is at least 3 characters long */
14934
0
    if (PyUnicode_Check(v)) {
14935
0
        if (PyUnicode_GET_LENGTH(v) == 1) {
14936
0
            return PyUnicode_READ_CHAR(v, 0);
14937
0
        }
14938
0
        PyErr_Format(PyExc_TypeError,
14939
0
                     "%%c requires an int or a unicode character, "
14940
0
                     "not a string of length %zd",
14941
0
                     PyUnicode_GET_LENGTH(v));
14942
0
        return (Py_UCS4) -1;
14943
0
    }
14944
0
    else {
14945
0
        int overflow;
14946
0
        long x = PyLong_AsLongAndOverflow(v, &overflow);
14947
0
        if (x == -1 && PyErr_Occurred()) {
14948
0
            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14949
0
                PyErr_Format(PyExc_TypeError,
14950
0
                             "%%c requires an int or a unicode character, not %T",
14951
0
                             v);
14952
0
                return (Py_UCS4) -1;
14953
0
            }
14954
0
            return (Py_UCS4) -1;
14955
0
        }
14956
14957
0
        if (x < 0 || x > MAX_UNICODE) {
14958
            /* this includes an overflow in converting to C long */
14959
0
            PyErr_SetString(PyExc_OverflowError,
14960
0
                            "%c arg not in range(0x110000)");
14961
0
            return (Py_UCS4) -1;
14962
0
        }
14963
14964
0
        return (Py_UCS4) x;
14965
0
    }
14966
0
}
14967
14968
/* Parse options of an argument: flags, width, precision.
14969
   Handle also "%(name)" syntax.
14970
14971
   Return 0 if the argument has been formatted into arg->str.
14972
   Return 1 if the argument has been written into ctx->writer,
14973
   Raise an exception and return -1 on error. */
14974
static int
14975
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14976
                         struct unicode_format_arg_t *arg)
14977
44.8M
{
14978
44.8M
#define FORMAT_READ(ctx) \
14979
45.1M
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14980
14981
44.8M
    PyObject *v;
14982
14983
44.8M
    if (arg->ch == '(') {
14984
        /* Get argument value from a dictionary. Example: "%(name)s". */
14985
37.0k
        Py_ssize_t keystart;
14986
37.0k
        Py_ssize_t keylen;
14987
37.0k
        PyObject *key;
14988
37.0k
        int pcount = 1;
14989
14990
37.0k
        if (ctx->dict == NULL) {
14991
0
            PyErr_SetString(PyExc_TypeError,
14992
0
                            "format requires a mapping");
14993
0
            return -1;
14994
0
        }
14995
37.0k
        ++ctx->fmtpos;
14996
37.0k
        --ctx->fmtcnt;
14997
37.0k
        keystart = ctx->fmtpos;
14998
        /* Skip over balanced parentheses */
14999
333k
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
15000
296k
            arg->ch = FORMAT_READ(ctx);
15001
296k
            if (arg->ch == ')')
15002
37.0k
                --pcount;
15003
259k
            else if (arg->ch == '(')
15004
0
                ++pcount;
15005
296k
            ctx->fmtpos++;
15006
296k
        }
15007
37.0k
        keylen = ctx->fmtpos - keystart - 1;
15008
37.0k
        if (ctx->fmtcnt < 0 || pcount > 0) {
15009
0
            PyErr_SetString(PyExc_ValueError,
15010
0
                            "incomplete format key");
15011
0
            return -1;
15012
0
        }
15013
37.0k
        key = PyUnicode_Substring(ctx->fmtstr,
15014
37.0k
                                  keystart, keystart + keylen);
15015
37.0k
        if (key == NULL)
15016
0
            return -1;
15017
37.0k
        if (ctx->args_owned) {
15018
26.4k
            ctx->args_owned = 0;
15019
26.4k
            Py_DECREF(ctx->args);
15020
26.4k
        }
15021
37.0k
        ctx->args = PyObject_GetItem(ctx->dict, key);
15022
37.0k
        Py_DECREF(key);
15023
37.0k
        if (ctx->args == NULL)
15024
0
            return -1;
15025
37.0k
        ctx->args_owned = 1;
15026
37.0k
        ctx->arglen = -1;
15027
37.0k
        ctx->argidx = -2;
15028
37.0k
    }
15029
15030
    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
15031
44.8M
    while (--ctx->fmtcnt >= 0) {
15032
44.8M
        arg->ch = FORMAT_READ(ctx);
15033
44.8M
        ctx->fmtpos++;
15034
44.8M
        switch (arg->ch) {
15035
0
        case '-': arg->flags |= F_LJUST; continue;
15036
0
        case '+': arg->flags |= F_SIGN; continue;
15037
0
        case ' ': arg->flags |= F_BLANK; continue;
15038
0
        case '#': arg->flags |= F_ALT; continue;
15039
1.53k
        case '0': arg->flags |= F_ZERO; continue;
15040
44.8M
        }
15041
44.8M
        break;
15042
44.8M
    }
15043
15044
    /* Parse width. Example: "%10s" => width=10 */
15045
44.8M
    if (arg->ch == '*') {
15046
0
        v = unicode_format_getnextarg(ctx);
15047
0
        if (v == NULL)
15048
0
            return -1;
15049
0
        if (!PyLong_Check(v)) {
15050
0
            PyErr_SetString(PyExc_TypeError,
15051
0
                            "* wants int");
15052
0
            return -1;
15053
0
        }
15054
0
        arg->width = PyLong_AsSsize_t(v);
15055
0
        if (arg->width == -1 && PyErr_Occurred())
15056
0
            return -1;
15057
0
        if (arg->width < 0) {
15058
0
            arg->flags |= F_LJUST;
15059
0
            arg->width = -arg->width;
15060
0
        }
15061
0
        if (--ctx->fmtcnt >= 0) {
15062
0
            arg->ch = FORMAT_READ(ctx);
15063
0
            ctx->fmtpos++;
15064
0
        }
15065
0
    }
15066
44.8M
    else if (arg->ch >= '0' && arg->ch <= '9') {
15067
1.53k
        arg->width = arg->ch - '0';
15068
1.53k
        while (--ctx->fmtcnt >= 0) {
15069
1.53k
            arg->ch = FORMAT_READ(ctx);
15070
1.53k
            ctx->fmtpos++;
15071
1.53k
            if (arg->ch < '0' || arg->ch > '9')
15072
1.53k
                break;
15073
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
15074
               mixing signed and unsigned comparison. Since arg->ch is between
15075
               '0' and '9', casting to int is safe. */
15076
0
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15077
0
                PyErr_SetString(PyExc_ValueError,
15078
0
                                "width too big");
15079
0
                return -1;
15080
0
            }
15081
0
            arg->width = arg->width*10 + (arg->ch - '0');
15082
0
        }
15083
1.53k
    }
15084
15085
    /* Parse precision. Example: "%.3f" => prec=3 */
15086
44.8M
    if (arg->ch == '.') {
15087
0
        arg->prec = 0;
15088
0
        if (--ctx->fmtcnt >= 0) {
15089
0
            arg->ch = FORMAT_READ(ctx);
15090
0
            ctx->fmtpos++;
15091
0
        }
15092
0
        if (arg->ch == '*') {
15093
0
            v = unicode_format_getnextarg(ctx);
15094
0
            if (v == NULL)
15095
0
                return -1;
15096
0
            if (!PyLong_Check(v)) {
15097
0
                PyErr_SetString(PyExc_TypeError,
15098
0
                                "* wants int");
15099
0
                return -1;
15100
0
            }
15101
0
            arg->prec = PyLong_AsInt(v);
15102
0
            if (arg->prec == -1 && PyErr_Occurred())
15103
0
                return -1;
15104
0
            if (arg->prec < 0)
15105
0
                arg->prec = 0;
15106
0
            if (--ctx->fmtcnt >= 0) {
15107
0
                arg->ch = FORMAT_READ(ctx);
15108
0
                ctx->fmtpos++;
15109
0
            }
15110
0
        }
15111
0
        else if (arg->ch >= '0' && arg->ch <= '9') {
15112
0
            arg->prec = arg->ch - '0';
15113
0
            while (--ctx->fmtcnt >= 0) {
15114
0
                arg->ch = FORMAT_READ(ctx);
15115
0
                ctx->fmtpos++;
15116
0
                if (arg->ch < '0' || arg->ch > '9')
15117
0
                    break;
15118
0
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15119
0
                    PyErr_SetString(PyExc_ValueError,
15120
0
                                    "precision too big");
15121
0
                    return -1;
15122
0
                }
15123
0
                arg->prec = arg->prec*10 + (arg->ch - '0');
15124
0
            }
15125
0
        }
15126
0
    }
15127
15128
    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15129
44.8M
    if (ctx->fmtcnt >= 0) {
15130
44.8M
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15131
0
            if (--ctx->fmtcnt >= 0) {
15132
0
                arg->ch = FORMAT_READ(ctx);
15133
0
                ctx->fmtpos++;
15134
0
            }
15135
0
        }
15136
44.8M
    }
15137
44.8M
    if (ctx->fmtcnt < 0) {
15138
0
        PyErr_SetString(PyExc_ValueError,
15139
0
                        "incomplete format");
15140
0
        return -1;
15141
0
    }
15142
44.8M
    return 0;
15143
15144
44.8M
#undef FORMAT_READ
15145
44.8M
}
15146
15147
/* Format one argument. Supported conversion specifiers:

   - "s", "r", "a": any type
   - "i", "d", "u": int or float
   - "o", "x", "X": int
   - "e", "E", "f", "F", "g", "G": float
   - "c": int or str (1 character)

   The argument is fetched with unicode_format_getnextarg() and the
   conversion is selected by arg->ch.

   When possible, the output is written directly into the Unicode writer
   (ctx->writer). A string is created when padding is required.

   Return 0 if the argument has been formatted into *p_str,
          1 if the argument has been written into ctx->writer,
         -1 on error. */
static int
unicode_format_arg_format(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject **p_str)
{
    PyObject *v;
    _PyUnicodeWriter *writer = &ctx->writer;

    /* Nothing follows this conversion in the format string: stop
       overallocating the writer buffer. */
    if (ctx->fmtcnt == 0)
        ctx->writer.overallocate = 0;

    v = unicode_format_getnextarg(ctx);
    if (v == NULL)
        return -1;


    switch (arg->ch) {
    case 's':
    case 'r':
    case 'a':
        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
            /* Fast path */
            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
                return -1;
            return 1;
        }

        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
            /* An exact str is used as-is for "%s". */
            *p_str = Py_NewRef(v);
        }
        else {
            if (arg->ch == 's')
                *p_str = PyObject_Str(v);
            else if (arg->ch == 'r')
                *p_str = PyObject_Repr(v);
            else
                *p_str = PyObject_ASCII(v);
        }
        break;

    case 'i':
    case 'd':
    case 'u':
    case 'o':
    case 'x':
    case 'X':
    {
        /* ret is 1 (already written to the writer) or -1 (error) when
           non-zero; 0 means the text is in *p_str and still needs to go
           through the output step. */
        int ret = mainformatlong(v, arg, p_str, writer);
        if (ret != 0)
            return ret;
        arg->sign = 1;
        break;
    }

    case 'e':
    case 'E':
    case 'f':
    case 'F':
    case 'g':
    case 'G':
        if (arg->width == -1 && arg->prec == -1
            && !(arg->flags & (F_SIGN | F_BLANK)))
        {
            /* Fast path */
            if (formatfloat(v, arg, NULL, writer) == -1)
                return -1;
            return 1;
        }

        arg->sign = 1;
        if (formatfloat(v, arg, p_str, NULL) == -1)
            return -1;
        break;

    case 'c':
    {
        Py_UCS4 ch = formatchar(v);
        if (ch == (Py_UCS4) -1)
            return -1;
        if (arg->width == -1 && arg->prec == -1) {
            /* Fast path */
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
                return -1;
            return 1;
        }
        *p_str = PyUnicode_FromOrdinal(ch);
        break;
    }

    default:
        /* NOTE(review): 31 (0x1f) is a control character; the lower bound of
           the "printable" range looks like it was meant to be 32 (space) --
           confirm before changing, since it alters the error message. */
        PyErr_Format(PyExc_ValueError,
                     "unsupported format character '%c' (0x%x) "
                     "at index %zd",
                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
                     (int)arg->ch,
                     ctx->fmtpos - 1);
        return -1;
    }
    /* Covers every branch above that stores a newly created object in
       *p_str without checking it. */
    if (*p_str == NULL)
        return -1;
    assert (PyUnicode_Check(*p_str));
    return 0;
}
15264
15265
/* Write the already formatted string 'str' into ctx->writer, applying the
   field width, precision (truncation for "s", "r", "a"), sign, fill
   character and alternate-form prefix described by 'arg'.

   arg->width and arg->sign are used as scratch state and are modified.

   Return 0 on success, -1 with an exception set if the writer fails. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    int kind;
    const void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero-fill only applies to conversions that set arg->sign. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is taken out of the string and written separately. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* Account for the fill character only when left padding will be
       written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* One extra slot is needed when the sign does not fit in the width. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With zero-fill the sign goes before the padding; with space-fill
           it is written after the padding (see below). */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The prefix is excluded from width and len in both fill modes. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Prevents the right-padding step below from padding again. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    /* (right padding always uses spaces, regardless of 'fill') */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
15416
15417
/* Helper of PyUnicode_Format(): format one arg.
15418
   Return 0 on success, raise an exception and return -1 on error. */
15419
static int
15420
unicode_format_arg(struct unicode_formatter_t *ctx)
15421
44.8M
{
15422
44.8M
    struct unicode_format_arg_t arg;
15423
44.8M
    PyObject *str;
15424
44.8M
    int ret;
15425
15426
44.8M
    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15427
44.8M
    if (arg.ch == '%') {
15428
0
        ctx->fmtpos++;
15429
0
        ctx->fmtcnt--;
15430
0
        if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15431
0
            return -1;
15432
0
        return 0;
15433
0
    }
15434
44.8M
    arg.flags = 0;
15435
44.8M
    arg.width = -1;
15436
44.8M
    arg.prec = -1;
15437
44.8M
    arg.sign = 0;
15438
44.8M
    str = NULL;
15439
15440
44.8M
    ret = unicode_format_arg_parse(ctx, &arg);
15441
44.8M
    if (ret == -1)
15442
0
        return -1;
15443
15444
44.8M
    ret = unicode_format_arg_format(ctx, &arg, &str);
15445
44.8M
    if (ret == -1)
15446
4.56M
        return -1;
15447
15448
40.3M
    if (ret != 1) {
15449
33.6M
        ret = unicode_format_arg_output(ctx, &arg, str);
15450
33.6M
        Py_DECREF(str);
15451
33.6M
        if (ret == -1)
15452
0
            return -1;
15453
33.6M
    }
15454
15455
40.3M
    if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15456
0
        PyErr_SetString(PyExc_TypeError,
15457
0
                        "not all arguments converted during string formatting");
15458
0
        return -1;
15459
0
    }
15460
40.3M
    return 0;
15461
40.3M
}
15462
15463
/* Apply the printf-style format string `format` to `args`.

   `args` may be a tuple of positional values, a mapping (any mapping that
   is neither a tuple nor a str), or a single object.

   Return a new string, or NULL with an exception set on error. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (ensure_unicode(format) < 0) {
        return NULL;
    }

    /* Format string cursor: fmtpos is the read index, fmtcnt the number of
       characters not consumed yet. */
    ctx.fmtstr = format;
    ctx.fmtdata = PyUnicode_DATA(format);
    ctx.fmtkind = PyUnicode_KIND(format);
    ctx.fmtcnt = PyUnicode_GET_LENGTH(format);
    ctx.fmtpos = 0;

    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    const int is_tuple = PyTuple_Check(args);
    if (is_tuple) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
    }
    else {
        /* Not a tuple: arglen/argidx hold the markers -1/-2. */
        ctx.arglen = -1;
        ctx.argidx = -2;
    }
    ctx.args_owned = 0;
    ctx.dict = (PyMapping_Check(args) && !is_tuple && !PyUnicode_Check(args))
               ? args
               : NULL;
    ctx.args = args;

    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) == '%') {
            /* Conversion specifier: skip the '%' and format one argument. */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1) {
                goto onError;
            }
            continue;
        }

        /* Literal text: scan up to the next '%' or the end of the format
           string and copy the whole run at once. */
        Py_ssize_t literal_start = ctx.fmtpos++;
        while (ctx.fmtcnt >= 0 &&
               PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            ctx.fmtpos++;
            ctx.fmtcnt--;
        }
        if (ctx.fmtcnt < 0) {
            /* The run reaches the end of the format string: step back from
               the position past the end and stop over-allocating. */
            ctx.fmtpos--;
            ctx.writer.overallocate = 0;
        }

        if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                            literal_start, ctx.fmtpos) < 0) {
            goto onError;
        }
    }

    if (ctx.argidx < ctx.arglen && !ctx.dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return _PyUnicodeWriter_Finish(&ctx.writer);

  onError:
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}
15545
15546
static PyObject *
15547
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15548
15549
/*[clinic input]
15550
@classmethod
15551
str.__new__ as unicode_new
15552
15553
    object as x: object = NULL
15554
    encoding: str = NULL
15555
    errors: str = NULL
15556
15557
[clinic start generated code]*/
15558
15559
static PyObject *
15560
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15561
                 const char *errors)
15562
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15563
9.98M
{
15564
9.98M
    PyObject *unicode;
15565
9.98M
    if (x == NULL) {
15566
0
        unicode = unicode_get_empty();
15567
0
    }
15568
9.98M
    else if (encoding == NULL && errors == NULL) {
15569
9.98M
        unicode = PyObject_Str(x);
15570
9.98M
    }
15571
0
    else {
15572
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15573
0
    }
15574
15575
9.98M
    if (unicode != NULL && type != &PyUnicode_Type) {
15576
9.98M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15577
9.98M
    }
15578
9.98M
    return unicode;
15579
9.98M
}
15580
15581
static const char *
15582
arg_as_utf8(PyObject *obj, const char *name)
15583
899k
{
15584
899k
    if (!PyUnicode_Check(obj)) {
15585
0
        PyErr_Format(PyExc_TypeError,
15586
0
                     "str() argument '%s' must be str, not %T",
15587
0
                     name, obj);
15588
0
        return NULL;
15589
0
    }
15590
899k
    return _PyUnicode_AsUTF8NoNUL(obj);
15591
899k
}
15592
15593
/* Vectorcall entry point of the exact `str` type.

   Calls with keyword arguments are forwarded to unicode_new(); purely
   positional calls (0 to 3 arguments) are handled here directly. */
static PyObject *
unicode_vectorcall(PyObject *type, PyObject *const *args,
                   size_t nargsf, PyObject *kwnames)
{
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));

    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);

    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
        // Fallback to unicode_new(): repackage the stack as (tuple, dict)
        PyObject *posargs = _PyTuple_FromArray(args, nargs);
        if (posargs == NULL) {
            return NULL;
        }
        PyObject *kwargs = _PyStack_AsDict(args + nargs, kwnames);
        if (kwargs == NULL) {
            Py_DECREF(posargs);
            return NULL;
        }
        PyObject *result = unicode_new(_PyType_CAST(type), posargs, kwargs);
        Py_DECREF(posargs);
        Py_DECREF(kwargs);
        return result;
    }

    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
        return NULL;
    }

    switch (nargs) {
    case 0:
        /* str() */
        return unicode_get_empty();
    case 1:
        /* str(object) */
        return PyObject_Str(args[0]);
    default:
        break;
    }

    /* str(object, encoding[, errors]) */
    const char *encoding = arg_as_utf8(args[1], "encoding");
    if (encoding == NULL) {
        return NULL;
    }

    const char *errors = NULL;
    if (nargs == 3) {
        errors = arg_as_utf8(args[2], "errors");
        if (errors == NULL) {
            return NULL;
        }
    }
    return PyUnicode_FromEncodedObject(args[0], encoding, errors);
}
15639
15640
static PyObject *
15641
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15642
9.98M
{
15643
9.98M
    PyObject *self;
15644
9.98M
    Py_ssize_t length, char_size;
15645
9.98M
    int share_utf8;
15646
9.98M
    int kind;
15647
9.98M
    void *data;
15648
15649
9.98M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
15650
9.98M
    assert(_PyUnicode_CHECK(unicode));
15651
15652
9.98M
    self = type->tp_alloc(type, 0);
15653
9.98M
    if (self == NULL) {
15654
0
        return NULL;
15655
0
    }
15656
9.98M
    kind = PyUnicode_KIND(unicode);
15657
9.98M
    length = PyUnicode_GET_LENGTH(unicode);
15658
15659
9.98M
    _PyUnicode_LENGTH(self) = length;
15660
#ifdef Py_DEBUG
15661
    _PyUnicode_HASH(self) = -1;
15662
#else
15663
9.98M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15664
9.98M
#endif
15665
9.98M
    _PyUnicode_STATE(self).interned = 0;
15666
9.98M
    _PyUnicode_STATE(self).kind = kind;
15667
9.98M
    _PyUnicode_STATE(self).compact = 0;
15668
9.98M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15669
9.98M
    _PyUnicode_STATE(self).statically_allocated = 0;
15670
9.98M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
15671
9.98M
    PyUnicode_SET_UTF8(self, NULL);
15672
9.98M
    _PyUnicode_DATA_ANY(self) = NULL;
15673
15674
9.98M
    share_utf8 = 0;
15675
9.98M
    if (kind == PyUnicode_1BYTE_KIND) {
15676
8.83M
        char_size = 1;
15677
8.83M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15678
8.80M
            share_utf8 = 1;
15679
8.83M
    }
15680
1.14M
    else if (kind == PyUnicode_2BYTE_KIND) {
15681
1.09M
        char_size = 2;
15682
1.09M
    }
15683
53.1k
    else {
15684
53.1k
        assert(kind == PyUnicode_4BYTE_KIND);
15685
53.1k
        char_size = 4;
15686
53.1k
    }
15687
15688
    /* Ensure we won't overflow the length. */
15689
9.98M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15690
0
        PyErr_NoMemory();
15691
0
        goto onError;
15692
0
    }
15693
9.98M
    data = PyMem_Malloc((length + 1) * char_size);
15694
9.98M
    if (data == NULL) {
15695
0
        PyErr_NoMemory();
15696
0
        goto onError;
15697
0
    }
15698
15699
9.98M
    _PyUnicode_DATA_ANY(self) = data;
15700
9.98M
    if (share_utf8) {
15701
8.80M
        PyUnicode_SET_UTF8_LENGTH(self, length);
15702
8.80M
        PyUnicode_SET_UTF8(self, data);
15703
8.80M
    }
15704
15705
9.98M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
15706
9.98M
    assert(_PyUnicode_CheckConsistency(self, 1));
15707
#ifdef Py_DEBUG
15708
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15709
#endif
15710
9.98M
    return self;
15711
15712
0
onError:
15713
0
    Py_DECREF(self);
15714
0
    return NULL;
15715
9.98M
}
15716
15717
void
15718
_PyUnicode_ExactDealloc(PyObject *op)
15719
50.5M
{
15720
50.5M
    assert(PyUnicode_CheckExact(op));
15721
50.5M
    unicode_dealloc(op);
15722
50.5M
}
15723
15724
PyDoc_STRVAR(unicode_doc,
15725
"str(object='') -> str\n\
15726
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15727
\n\
15728
Create a new string object from the given object. If encoding or\n\
15729
errors is specified, then the object must expose a data buffer\n\
15730
that will be decoded using the given encoding and error handler.\n\
15731
Otherwise, returns the result of object.__str__() (if defined)\n\
15732
or repr(object).\n\
15733
encoding defaults to 'utf-8'.\n\
15734
errors defaults to 'strict'.");
15735
15736
static PyObject *unicode_iter(PyObject *seq);
15737
15738
PyTypeObject PyUnicode_Type = {
15739
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
15740
    "str",                        /* tp_name */
15741
    sizeof(PyUnicodeObject),      /* tp_basicsize */
15742
    0,                            /* tp_itemsize */
15743
    /* Slots */
15744
    unicode_dealloc,              /* tp_dealloc */
15745
    0,                            /* tp_vectorcall_offset */
15746
    0,                            /* tp_getattr */
15747
    0,                            /* tp_setattr */
15748
    0,                            /* tp_as_async */
15749
    unicode_repr,                 /* tp_repr */
15750
    &unicode_as_number,           /* tp_as_number */
15751
    &unicode_as_sequence,         /* tp_as_sequence */
15752
    &unicode_as_mapping,          /* tp_as_mapping */
15753
    unicode_hash,                 /* tp_hash*/
15754
    0,                            /* tp_call*/
15755
    unicode_str,                  /* tp_str */
15756
    PyObject_GenericGetAttr,      /* tp_getattro */
15757
    0,                            /* tp_setattro */
15758
    0,                            /* tp_as_buffer */
15759
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15760
        Py_TPFLAGS_UNICODE_SUBCLASS |
15761
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
15762
    unicode_doc,                  /* tp_doc */
15763
    0,                            /* tp_traverse */
15764
    0,                            /* tp_clear */
15765
    PyUnicode_RichCompare,        /* tp_richcompare */
15766
    0,                            /* tp_weaklistoffset */
15767
    unicode_iter,                 /* tp_iter */
15768
    0,                            /* tp_iternext */
15769
    unicode_methods,              /* tp_methods */
15770
    0,                            /* tp_members */
15771
    0,                            /* tp_getset */
15772
    0,                            /* tp_base */
15773
    0,                            /* tp_dict */
15774
    0,                            /* tp_descr_get */
15775
    0,                            /* tp_descr_set */
15776
    0,                            /* tp_dictoffset */
15777
    0,                            /* tp_init */
15778
    0,                            /* tp_alloc */
15779
    unicode_new,                  /* tp_new */
15780
    PyObject_Free,                /* tp_free */
15781
    .tp_vectorcall = unicode_vectorcall,
15782
};
15783
15784
/* Initialize the Unicode implementation */
15785
15786
static void
15787
_init_global_state(void)
15788
16
{
15789
16
    static int initialized = 0;
15790
16
    if (initialized) {
15791
0
        return;
15792
0
    }
15793
16
    initialized = 1;
15794
15795
    /* initialize the linebreak bloom filter */
15796
16
    const Py_UCS2 linebreak[] = {
15797
16
        0x000A, /* LINE FEED */
15798
16
        0x000D, /* CARRIAGE RETURN */
15799
16
        0x001C, /* FILE SEPARATOR */
15800
16
        0x001D, /* GROUP SEPARATOR */
15801
16
        0x001E, /* RECORD SEPARATOR */
15802
16
        0x0085, /* NEXT LINE */
15803
16
        0x2028, /* LINE SEPARATOR */
15804
16
        0x2029, /* PARAGRAPH SEPARATOR */
15805
16
    };
15806
16
    bloom_linebreak = make_bloom_mask(
15807
16
        PyUnicode_2BYTE_KIND, linebreak,
15808
16
        Py_ARRAY_LENGTH(linebreak));
15809
16
}
15810
15811
void
15812
_PyUnicode_InitState(PyInterpreterState *interp)
15813
16
{
15814
16
    if (!_Py_IsMainInterpreter(interp)) {
15815
0
        return;
15816
0
    }
15817
16
    _init_global_state();
15818
16
}
15819
15820
15821
PyStatus
15822
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
15823
16
{
15824
16
    if (_Py_IsMainInterpreter(interp)) {
15825
16
        PyStatus status = init_global_interned_strings(interp);
15826
16
        if (_PyStatus_EXCEPTION(status)) {
15827
0
            return status;
15828
0
        }
15829
16
    }
15830
16
    assert(INTERNED_STRINGS);
15831
15832
16
    if (init_interned_dict(interp)) {
15833
0
        PyErr_Clear();
15834
0
        return _PyStatus_ERR("failed to create interned dict");
15835
0
    }
15836
15837
16
    return _PyStatus_OK();
15838
16
}
15839
15840
15841
PyStatus
15842
_PyUnicode_InitTypes(PyInterpreterState *interp)
15843
16
{
15844
16
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
15845
0
        goto error;
15846
0
    }
15847
16
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
15848
0
        goto error;
15849
0
    }
15850
16
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
15851
0
        goto error;
15852
0
    }
15853
16
    return _PyStatus_OK();
15854
15855
0
error:
15856
0
    return _PyStatus_ERR("Can't initialize unicode types");
15857
16
}
15858
15859
static /* non-null */ PyObject*
15860
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
15861
16.4k
{
15862
    // Note that this steals a reference to `s`, but in many cases that
15863
    // stolen ref is returned, requiring no decref/incref.
15864
15865
16.4k
    assert(s != NULL);
15866
16.4k
    assert(_PyUnicode_CHECK(s));
15867
16.4k
    assert(_PyUnicode_STATE(s).statically_allocated);
15868
16.4k
    assert(!PyUnicode_CHECK_INTERNED(s));
15869
15870
#ifdef Py_DEBUG
15871
    /* We must not add process-global interned string if there's already a
15872
     * per-interpreter interned_dict, which might contain duplicates.
15873
     */
15874
    PyObject *interned = get_interned_dict(interp);
15875
    assert(interned == NULL);
15876
#endif
15877
15878
    /* Look in the global cache first. */
15879
16.4k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
15880
    /* We should only init each string once */
15881
16.4k
    assert(r == NULL);
15882
    /* but just in case (for the non-debug build), handle this */
15883
16.4k
    if (r != NULL && r != s) {
15884
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
15885
0
        assert(_PyUnicode_CHECK(r));
15886
0
        Py_DECREF(s);
15887
0
        return Py_NewRef(r);
15888
0
    }
15889
15890
16.4k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
15891
0
        Py_FatalError("failed to intern static string");
15892
0
    }
15893
15894
16.4k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
15895
16.4k
    return s;
15896
16.4k
}
15897
15898
void
15899
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
15900
16.4k
{
15901
    // This should only be called as part of runtime initialization
15902
16.4k
    assert(!Py_IsInitialized());
15903
15904
16.4k
    *p = intern_static(interp, *p);
15905
16.4k
    assert(*p);
15906
16.4k
}
15907
15908
static void
15909
immortalize_interned(PyObject *s)
15910
91.4k
{
15911
91.4k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
15912
91.4k
    assert(!_Py_IsImmortal(s));
15913
#ifdef Py_REF_DEBUG
15914
    /* The reference count value should be excluded from the RefTotal.
15915
       The decrements to these objects will not be registered so they
15916
       need to be accounted for in here. */
15917
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
15918
        _Py_DecRefTotal(_PyThreadState_GET());
15919
    }
15920
#endif
15921
91.4k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
15922
91.4k
    _Py_SetImmortal(s);
15923
91.4k
}
15924
15925
static /* non-null */ PyObject*
15926
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
15927
              bool immortalize)
15928
32.7M
{
15929
    // Note that this steals a reference to `s`, but in many cases that
15930
    // stolen ref is returned, requiring no decref/incref.
15931
15932
#ifdef Py_DEBUG
15933
    assert(s != NULL);
15934
    assert(_PyUnicode_CHECK(s));
15935
#else
15936
32.7M
    if (s == NULL || !PyUnicode_Check(s)) {
15937
0
        return s;
15938
0
    }
15939
32.7M
#endif
15940
15941
    /* If it's a subclass, we don't really know what putting
15942
       it in the interned dict might do. */
15943
32.7M
    if (!PyUnicode_CheckExact(s)) {
15944
0
        return s;
15945
0
    }
15946
15947
    /* Is it already interned? */
15948
32.7M
    switch (PyUnicode_CHECK_INTERNED(s)) {
15949
2.85M
        case SSTATE_NOT_INTERNED:
15950
            // no, go on
15951
2.85M
            break;
15952
12.9k
        case SSTATE_INTERNED_MORTAL:
15953
            // yes but we might need to make it immortal
15954
12.9k
            if (immortalize) {
15955
50
                immortalize_interned(s);
15956
50
            }
15957
12.9k
            return s;
15958
29.9M
        default:
15959
            // all done
15960
29.9M
            return s;
15961
32.7M
    }
15962
15963
    /* Statically allocated strings must be already interned. */
15964
2.85M
    assert(!_PyUnicode_STATE(s).statically_allocated);
15965
15966
#if Py_GIL_DISABLED
15967
    /* In the free-threaded build, all interned strings are immortal */
15968
    immortalize = 1;
15969
#endif
15970
15971
    /* If it's already immortal, intern it as such */
15972
2.85M
    if (_Py_IsImmortal(s)) {
15973
0
        immortalize = 1;
15974
0
    }
15975
15976
    /* if it's a short string, get the singleton */
15977
2.85M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
15978
2.85M
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
15979
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
15980
0
        assert(PyUnicode_CHECK_INTERNED(r));
15981
0
        Py_DECREF(s);
15982
0
        return r;
15983
0
    }
15984
#ifdef Py_DEBUG
15985
    assert(!unicode_is_singleton(s));
15986
#endif
15987
15988
    /* Look in the global cache now. */
15989
2.85M
    {
15990
2.85M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
15991
2.85M
        if (r != NULL) {
15992
249k
            assert(_PyUnicode_STATE(r).statically_allocated);
15993
249k
            assert(r != s);  // r must be statically_allocated; s is not
15994
249k
            Py_DECREF(s);
15995
249k
            return Py_NewRef(r);
15996
249k
        }
15997
2.85M
    }
15998
15999
    /* Do a setdefault on the per-interpreter cache. */
16000
2.60M
    PyObject *interned = get_interned_dict(interp);
16001
2.60M
    assert(interned != NULL);
16002
16003
2.60M
    LOCK_INTERNED(interp);
16004
2.60M
    PyObject *t;
16005
2.60M
    {
16006
2.60M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
16007
2.60M
        if (res < 0) {
16008
0
            PyErr_Clear();
16009
0
            UNLOCK_INTERNED(interp);
16010
0
            return s;
16011
0
        }
16012
2.60M
        else if (res == 1) {
16013
            // value was already present (not inserted)
16014
2.04M
            Py_DECREF(s);
16015
2.04M
            if (immortalize &&
16016
2.04M
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
16017
4.10k
                immortalize_interned(t);
16018
4.10k
            }
16019
2.04M
            UNLOCK_INTERNED(interp);
16020
2.04M
            return t;
16021
2.04M
        }
16022
563k
        else {
16023
            // value was newly inserted
16024
563k
            assert (s == t);
16025
563k
            Py_DECREF(t);
16026
563k
        }
16027
2.60M
    }
16028
16029
    /* NOT_INTERNED -> INTERNED_MORTAL */
16030
16031
563k
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
16032
16033
563k
    if (!_Py_IsImmortal(s)) {
16034
        /* The two references in interned dict (key and value) are not counted.
16035
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
16036
563k
        Py_DECREF(s);
16037
563k
        Py_DECREF(s);
16038
563k
    }
16039
563k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
16040
16041
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
16042
16043
#ifdef Py_DEBUG
16044
    if (_Py_IsImmortal(s)) {
16045
        assert(immortalize);
16046
    }
16047
#endif
16048
563k
    if (immortalize) {
16049
87.2k
        immortalize_interned(s);
16050
87.2k
    }
16051
16052
563k
    UNLOCK_INTERNED(interp);
16053
563k
    return s;
16054
2.60M
}
16055
16056
void
16057
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
16058
2.84M
{
16059
2.84M
    *p = intern_common(interp, *p, 1);
16060
2.84M
    assert(*p);
16061
2.84M
}
16062
16063
void
16064
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
16065
29.9M
{
16066
29.9M
    *p = intern_common(interp, *p, 0);
16067
29.9M
    assert(*p);
16068
29.9M
}
16069
16070
16071
void
16072
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
16073
0
{
16074
0
    _PyUnicode_InternImmortal(interp, p);
16075
0
    return;
16076
0
}
16077
16078
void
16079
PyUnicode_InternInPlace(PyObject **p)
16080
0
{
16081
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
16082
0
    _PyUnicode_InternMortal(interp, p);
16083
0
}
16084
16085
// Public-looking name kept for the stable ABI; user should not call this:
16086
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
16087
void
16088
PyUnicode_InternImmortal(PyObject **p)
16089
0
{
16090
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
16091
0
    _PyUnicode_InternImmortal(interp, p);
16092
0
}
16093
16094
PyObject *
16095
PyUnicode_InternFromString(const char *cp)
16096
888k
{
16097
888k
    PyObject *s = PyUnicode_FromString(cp);
16098
888k
    if (s == NULL) {
16099
0
        return NULL;
16100
0
    }
16101
888k
    PyInterpreterState *interp = _PyInterpreterState_GET();
16102
888k
    _PyUnicode_InternMortal(interp, &s);
16103
888k
    return s;
16104
888k
}
16105
16106
16107
void
16108
_PyUnicode_ClearInterned(PyInterpreterState *interp)
16109
0
{
16110
0
    PyObject *interned = get_interned_dict(interp);
16111
0
    if (interned == NULL) {
16112
0
        return;
16113
0
    }
16114
0
    assert(PyDict_CheckExact(interned));
16115
16116
0
    if (has_shared_intern_dict(interp)) {
16117
        // the dict doesn't belong to this interpreter, skip the debug
16118
        // checks on it and just clear the pointer to it
16119
0
        clear_interned_dict(interp);
16120
0
        return;
16121
0
    }
16122
16123
#ifdef INTERNED_STATS
16124
    fprintf(stderr, "releasing %zd interned strings\n",
16125
            PyDict_GET_SIZE(interned));
16126
16127
    Py_ssize_t total_length = 0;
16128
#endif
16129
0
    Py_ssize_t pos = 0;
16130
0
    PyObject *s, *ignored_value;
16131
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
16132
0
        int shared = 0;
16133
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
16134
0
        case SSTATE_INTERNED_IMMORTAL:
16135
            /* Make immortal interned strings mortal again. */
16136
            // Skip the Immortal Instance check and restore
16137
            // the two references (key and value) ignored
16138
            // by PyUnicode_InternInPlace().
16139
0
            _Py_SetMortal(s, 2);
16140
#ifdef Py_REF_DEBUG
16141
            /* let's be pedantic with the ref total */
16142
            _Py_IncRefTotal(_PyThreadState_GET());
16143
            _Py_IncRefTotal(_PyThreadState_GET());
16144
#endif
16145
#ifdef INTERNED_STATS
16146
            total_length += PyUnicode_GET_LENGTH(s);
16147
#endif
16148
0
            break;
16149
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
16150
            /* It is shared between interpreters, so we should unmark it
16151
               only when this is the last interpreter in which it's
16152
               interned.  We immortalize all the statically initialized
16153
               strings during startup, so we can rely on the
16154
               main interpreter to be the last one. */
16155
0
            if (!_Py_IsMainInterpreter(interp)) {
16156
0
                shared = 1;
16157
0
            }
16158
0
            break;
16159
0
        case SSTATE_INTERNED_MORTAL:
16160
            // Restore 2 references held by the interned dict; these will
16161
            // be decref'd by clear_interned_dict's PyDict_Clear.
16162
0
            _Py_RefcntAdd(s, 2);
16163
#ifdef Py_REF_DEBUG
16164
            /* let's be pedantic with the ref total */
16165
            _Py_IncRefTotal(_PyThreadState_GET());
16166
            _Py_IncRefTotal(_PyThreadState_GET());
16167
#endif
16168
0
            break;
16169
0
        case SSTATE_NOT_INTERNED:
16170
0
            _Py_FALLTHROUGH;
16171
0
        default:
16172
0
            Py_UNREACHABLE();
16173
0
        }
16174
0
        if (!shared) {
16175
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
16176
0
        }
16177
0
    }
16178
#ifdef INTERNED_STATS
16179
    fprintf(stderr,
16180
            "total length of all interned strings: %zd characters\n",
16181
            total_length);
16182
#endif
16183
16184
0
    struct _Py_unicode_state *state = &interp->unicode;
16185
0
    struct _Py_unicode_ids *ids = &state->ids;
16186
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
16187
0
        Py_XINCREF(ids->array[i]);
16188
0
    }
16189
0
    clear_interned_dict(interp);
16190
0
    if (_Py_IsMainInterpreter(interp)) {
16191
0
        clear_global_interned_strings();
16192
0
    }
16193
0
}
16194
16195
16196
/********************* Unicode Iterator **************************/
16197
16198
typedef struct {
16199
    PyObject_HEAD
16200
    Py_ssize_t it_index;
16201
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
16202
} unicodeiterobject;
16203
16204
static void
16205
unicodeiter_dealloc(PyObject *op)
16206
1.42M
{
16207
1.42M
    unicodeiterobject *it = (unicodeiterobject *)op;
16208
1.42M
    _PyObject_GC_UNTRACK(it);
16209
1.42M
    Py_XDECREF(it->it_seq);
16210
1.42M
    PyObject_GC_Del(it);
16211
1.42M
}
16212
16213
static int
16214
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
16215
7
{
16216
7
    unicodeiterobject *it = (unicodeiterobject *)op;
16217
7
    Py_VISIT(it->it_seq);
16218
7
    return 0;
16219
7
}
16220
16221
/* Return the next character of the iterated string as the object produced
   by unicode_char(), or NULL (without setting an exception here) once the
   iterator is exhausted.  Exhaustion releases the string. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    assert(it != NULL);
    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        /* Already exhausted. */
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* End reached: drop the string so later calls return at once. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    int kind = PyUnicode_KIND(seq);
    const void *data = PyUnicode_DATA(seq);
    Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(chr);
}
16245
16246
/* Variant of unicodeiter_next() for compact ASCII strings: each character
   is returned as the matching entry of the ASCII singleton table.
   Return NULL once the iterator is exhausted. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        /* Already exhausted. */
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* End reached: drop the string so later calls return at once. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    /* The characters are read from the memory that directly follows the
       PyASCIIObject header. */
    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                          data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
}
16268
16269
static PyObject *
16270
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
16271
0
{
16272
0
    unicodeiterobject *it = (unicodeiterobject *)op;
16273
0
    Py_ssize_t len = 0;
16274
0
    if (it->it_seq)
16275
0
        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
16276
0
    return PyLong_FromSsize_t(len);
16277
0
}
16278
16279
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
16280
16281
static PyObject *
16282
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
16283
0
{
16284
0
    unicodeiterobject *it = (unicodeiterobject *)op;
16285
0
    PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));
16286
16287
    /* _PyEval_GetBuiltin can invoke arbitrary code,
16288
     * call must be before access of iterator pointers.
16289
     * see issue #101765 */
16290
16291
0
    if (it->it_seq != NULL) {
16292
0
        return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
16293
0
    } else {
16294
0
        PyObject *u = unicode_get_empty();
16295
0
        if (u == NULL) {
16296
0
            Py_XDECREF(iter);
16297
0
            return NULL;
16298
0
        }
16299
0
        return Py_BuildValue("N(N)", iter, u);
16300
0
    }
16301
0
}
16302
16303
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
16304
16305
static PyObject *
16306
unicodeiter_setstate(PyObject *op, PyObject *state)
16307
0
{
16308
0
    unicodeiterobject *it = (unicodeiterobject *)op;
16309
0
    Py_ssize_t index = PyLong_AsSsize_t(state);
16310
0
    if (index == -1 && PyErr_Occurred())
16311
0
        return NULL;
16312
0
    if (it->it_seq != NULL) {
16313
0
        if (index < 0)
16314
0
            index = 0;
16315
0
        else if (index > PyUnicode_GET_LENGTH(it->it_seq))
16316
0
            index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
16317
0
        it->it_index = index;
16318
0
    }
16319
0
    Py_RETURN_NONE;
16320
0
}
16321
16322
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16323
16324
static PyMethodDef unicodeiter_methods[] = {
16325
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
16326
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
16327
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
16328
    {NULL,      NULL}       /* sentinel */
16329
};
16330
16331
PyTypeObject PyUnicodeIter_Type = {
16332
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
16333
    "str_iterator",         /* tp_name */
16334
    sizeof(unicodeiterobject),      /* tp_basicsize */
16335
    0,                  /* tp_itemsize */
16336
    /* methods */
16337
    unicodeiter_dealloc,/* tp_dealloc */
16338
    0,                  /* tp_vectorcall_offset */
16339
    0,                  /* tp_getattr */
16340
    0,                  /* tp_setattr */
16341
    0,                  /* tp_as_async */
16342
    0,                  /* tp_repr */
16343
    0,                  /* tp_as_number */
16344
    0,                  /* tp_as_sequence */
16345
    0,                  /* tp_as_mapping */
16346
    0,                  /* tp_hash */
16347
    0,                  /* tp_call */
16348
    0,                  /* tp_str */
16349
    PyObject_GenericGetAttr,        /* tp_getattro */
16350
    0,                  /* tp_setattro */
16351
    0,                  /* tp_as_buffer */
16352
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
16353
    0,                  /* tp_doc */
16354
    unicodeiter_traverse, /* tp_traverse */
16355
    0,                  /* tp_clear */
16356
    0,                  /* tp_richcompare */
16357
    0,                  /* tp_weaklistoffset */
16358
    PyObject_SelfIter,          /* tp_iter */
16359
    unicodeiter_next,   /* tp_iternext */
16360
    unicodeiter_methods,            /* tp_methods */
16361
    0,
16362
};
16363
16364
PyTypeObject _PyUnicodeASCIIIter_Type = {
16365
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
16366
    .tp_name = "str_ascii_iterator",
16367
    .tp_basicsize = sizeof(unicodeiterobject),
16368
    .tp_dealloc = unicodeiter_dealloc,
16369
    .tp_getattro = PyObject_GenericGetAttr,
16370
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
16371
    .tp_traverse = unicodeiter_traverse,
16372
    .tp_iter = PyObject_SelfIter,
16373
    .tp_iternext = unicode_ascii_iter_next,
16374
    .tp_methods = unicodeiter_methods,
16375
};
16376
16377
/* Create a new iterator over the string `seq`.

   Compact ASCII strings get a "str_ascii_iterator", every other string a
   "str_iterator".  The iterator holds a strong reference to `seq`.
   Returns NULL with an exception set if `seq` is not a str or if the
   allocation fails. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
        ? &_PyUnicodeASCIIIter_Type
        : &PyUnicodeIter_Type;

    unicodeiterobject *iter = PyObject_GC_New(unicodeiterobject, iter_type);
    if (iter == NULL) {
        return NULL;
    }

    iter->it_index = 0;
    iter->it_seq = Py_NewRef(seq);
    /* Start GC tracking only once all fields are initialized. */
    _PyObject_GC_TRACK(iter);
    return (PyObject *)iter;
}
16399
16400
static int
16401
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
16402
64
{
16403
64
    int res;
16404
64
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16405
64
    if (res == -2) {
16406
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
16407
0
        return -1;
16408
0
    }
16409
64
    if (res < 0) {
16410
0
        PyErr_NoMemory();
16411
0
        return -1;
16412
0
    }
16413
64
    return 0;
16414
64
}
16415
16416
16417
static int
16418
config_get_codec_name(wchar_t **config_encoding)
16419
32
{
16420
32
    char *encoding;
16421
32
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16422
0
        return -1;
16423
0
    }
16424
16425
32
    PyObject *name_obj = NULL;
16426
32
    PyObject *codec = _PyCodec_Lookup(encoding);
16427
32
    PyMem_RawFree(encoding);
16428
16429
32
    if (!codec)
16430
0
        goto error;
16431
16432
32
    name_obj = PyObject_GetAttrString(codec, "name");
16433
32
    Py_CLEAR(codec);
16434
32
    if (!name_obj) {
16435
0
        goto error;
16436
0
    }
16437
16438
32
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16439
32
    Py_DECREF(name_obj);
16440
32
    if (wname == NULL) {
16441
0
        goto error;
16442
0
    }
16443
16444
32
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16445
32
    if (raw_wname == NULL) {
16446
0
        PyMem_Free(wname);
16447
0
        PyErr_NoMemory();
16448
0
        goto error;
16449
0
    }
16450
16451
32
    PyMem_RawFree(*config_encoding);
16452
32
    *config_encoding = raw_wname;
16453
16454
32
    PyMem_Free(wname);
16455
32
    return 0;
16456
16457
0
error:
16458
0
    Py_XDECREF(codec);
16459
0
    Py_XDECREF(name_obj);
16460
0
    return -1;
16461
32
}
16462
16463
16464
static PyStatus
16465
init_stdio_encoding(PyInterpreterState *interp)
16466
16
{
16467
    /* Update the stdio encoding to the normalized Python codec name. */
16468
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16469
16
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
16470
0
        return _PyStatus_ERR("failed to get the Python codec name "
16471
0
                             "of the stdio encoding");
16472
0
    }
16473
16
    return _PyStatus_OK();
16474
16
}
16475
16476
16477
static int
16478
init_fs_codec(PyInterpreterState *interp)
16479
16
{
16480
16
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
16481
16482
16
    _Py_error_handler error_handler;
16483
16
    error_handler = get_error_handler_wide(config->filesystem_errors);
16484
16
    if (error_handler == _Py_ERROR_UNKNOWN) {
16485
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
16486
0
        return -1;
16487
0
    }
16488
16489
16
    char *encoding, *errors;
16490
16
    if (encode_wstr_utf8(config->filesystem_encoding,
16491
16
                         &encoding,
16492
16
                         "filesystem_encoding") < 0) {
16493
0
        return -1;
16494
0
    }
16495
16496
16
    if (encode_wstr_utf8(config->filesystem_errors,
16497
16
                         &errors,
16498
16
                         "filesystem_errors") < 0) {
16499
0
        PyMem_RawFree(encoding);
16500
0
        return -1;
16501
0
    }
16502
16503
16
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16504
16
    PyMem_RawFree(fs_codec->encoding);
16505
16
    fs_codec->encoding = encoding;
16506
    /* encoding has been normalized by init_fs_encoding() */
16507
16
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16508
16
    PyMem_RawFree(fs_codec->errors);
16509
16
    fs_codec->errors = errors;
16510
16
    fs_codec->error_handler = error_handler;
16511
16512
#ifdef _Py_FORCE_UTF8_FS_ENCODING
16513
    assert(fs_codec->utf8 == 1);
16514
#endif
16515
16516
    /* At this point, PyUnicode_EncodeFSDefault() and
16517
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16518
       the C implementation of the filesystem encoding. */
16519
16520
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16521
       global configuration variables. */
16522
16
    if (_Py_IsMainInterpreter(interp)) {
16523
16524
16
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16525
16
                                      fs_codec->errors) < 0) {
16526
0
            PyErr_NoMemory();
16527
0
            return -1;
16528
0
        }
16529
16
    }
16530
16
    return 0;
16531
16
}
16532
16533
16534
static PyStatus
16535
init_fs_encoding(PyThreadState *tstate)
16536
16
{
16537
16
    PyInterpreterState *interp = tstate->interp;
16538
16539
    /* Update the filesystem encoding to the normalized Python codec name.
16540
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16541
       (Python codec name). */
16542
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16543
16
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16544
0
        _Py_DumpPathConfig(tstate);
16545
0
        return _PyStatus_ERR("failed to get the Python codec "
16546
0
                             "of the filesystem encoding");
16547
0
    }
16548
16549
16
    if (init_fs_codec(interp) < 0) {
16550
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
16551
0
    }
16552
16
    return _PyStatus_OK();
16553
16
}
16554
16555
16556
PyStatus
16557
_PyUnicode_InitEncodings(PyThreadState *tstate)
16558
16
{
16559
16
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
16560
16
    if (_PyStatus_EXCEPTION(status)) {
16561
0
        return status;
16562
0
    }
16563
16
    status = init_fs_encoding(tstate);
16564
16
    if (_PyStatus_EXCEPTION(status)) {
16565
0
        return status;
16566
0
    }
16567
16568
16
    return init_stdio_encoding(tstate->interp);
16569
16
}
16570
16571
16572
static void
16573
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16574
0
{
16575
0
    PyMem_RawFree(fs_codec->encoding);
16576
0
    fs_codec->encoding = NULL;
16577
0
    fs_codec->utf8 = 0;
16578
0
    PyMem_RawFree(fs_codec->errors);
16579
0
    fs_codec->errors = NULL;
16580
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16581
0
}
16582
16583
16584
#ifdef MS_WINDOWS
16585
int
16586
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
16587
{
16588
    PyInterpreterState *interp = _PyInterpreterState_GET();
16589
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);
16590
16591
    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
16592
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
16593
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");
16594
    if (encoding == NULL || errors == NULL) {
16595
        PyMem_RawFree(encoding);
16596
        PyMem_RawFree(errors);
16597
        PyErr_NoMemory();
16598
        return -1;
16599
    }
16600
16601
    PyMem_RawFree(config->filesystem_encoding);
16602
    config->filesystem_encoding = encoding;
16603
    PyMem_RawFree(config->filesystem_errors);
16604
    config->filesystem_errors = errors;
16605
16606
    return init_fs_codec(interp);
16607
}
16608
#endif
16609
16610
16611
#ifdef Py_DEBUG
16612
static inline int
16613
unicode_is_finalizing(void)
16614
{
16615
    return (get_interned_dict(_PyInterpreterState_Main()) == NULL);
16616
}
16617
#endif
16618
16619
16620
void
16621
_PyUnicode_FiniTypes(PyInterpreterState *interp)
16622
0
{
16623
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
16624
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
16625
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
16626
0
}
16627
16628
16629
void
16630
_PyUnicode_Fini(PyInterpreterState *interp)
16631
0
{
16632
0
    struct _Py_unicode_state *state = &interp->unicode;
16633
16634
0
    if (!has_shared_intern_dict(interp)) {
16635
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16636
0
        assert(get_interned_dict(interp) == NULL);
16637
0
    }
16638
16639
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
16640
16641
    // bpo-47182: force a unicodedata CAPI capsule re-import on
16642
    // subsequent initialization of interpreter.
16643
0
    interp->unicode.ucnhash_capi = NULL;
16644
16645
0
    unicode_clear_identifiers(state);
16646
0
}
16647
16648
/* A _string module, to export formatter_parser and formatter_field_name_split
16649
   to the string.Formatter class implemented in Python. */
16650
16651
static PyMethodDef _string_methods[] = {
16652
    {"formatter_field_name_split", formatter_field_name_split,
16653
     METH_O, PyDoc_STR("split the argument as a field name")},
16654
    {"formatter_parser", formatter_parser,
16655
     METH_O, PyDoc_STR("parse the argument as a format string")},
16656
    {NULL, NULL}
16657
};
16658
16659
static PyModuleDef_Slot module_slots[] = {
16660
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
16661
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
16662
    {0, NULL}
16663
};
16664
16665
static struct PyModuleDef _string_module = {
16666
    PyModuleDef_HEAD_INIT,
16667
    .m_name = "_string",
16668
    .m_doc = PyDoc_STR("string helper module"),
16669
    .m_size = 0,
16670
    .m_methods = _string_methods,
16671
    .m_slots = module_slots,
16672
};
16673
16674
PyMODINIT_FUNC
16675
PyInit__string(void)
16676
6
{
16677
6
    return PyModuleDef_Init(&_string_module);
16678
6
}
16679
16680
16681
#undef PyUnicode_KIND
16682
int PyUnicode_KIND(PyObject *op)
16683
0
{
16684
0
    if (!PyUnicode_Check(op)) {
16685
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
16686
0
        return -1;
16687
0
    }
16688
0
    return _PyASCIIObject_CAST(op)->state.kind;
16689
0
}
16690
16691
#undef PyUnicode_DATA
16692
void* PyUnicode_DATA(PyObject *op)
16693
0
{
16694
0
    if (!PyUnicode_Check(op)) {
16695
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
16696
0
        return NULL;
16697
0
    }
16698
0
    return _PyUnicode_DATA(op);
16699
0
}