Coverage Report

Created: 2025-09-05 07:10

/src/cpython/Objects/unicodeobject.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_freelist.h"      // _Py_FREELIST_FREE(), _Py_FREELIST_POP()
50
#include "pycore_initconfig.h"    // _PyStatus_OK()
51
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
52
#include "pycore_long.h"          // _PyLong_FormatWriter()
53
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
54
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
55
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
56
#include "pycore_pyhash.h"        // _Py_HashSecret_t
57
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
58
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
59
#include "pycore_tuple.h"         // _PyTuple_FromArray()
60
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
61
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
62
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
63
64
#include "stringlib/eq.h"         // unicode_eq()
65
#include <stddef.h>               // ptrdiff_t
66
67
#ifdef MS_WINDOWS
68
#include <windows.h>
69
#endif
70
71
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
72
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
73
#endif
74
75
/* Uncomment to display statistics on interned strings at exit
76
   in _PyUnicode_ClearInterned(). */
77
/* #define INTERNED_STATS 1 */
78
79
80
/*[clinic input]
81
class str "PyObject *" "&PyUnicode_Type"
82
[clinic start generated code]*/
83
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
84
85
/*[python input]
86
class Py_UCS4_converter(CConverter):
87
    type = 'Py_UCS4'
88
    converter = 'convert_uc'
89
90
    def converter_init(self):
91
        if self.default is not unspecified:
92
            self.c_default = ascii(self.default)
93
            if len(self.c_default) > 4 or self.c_default[0] != "'":
94
                self.c_default = hex(ord(self.default))
95
96
[python start generated code]*/
97
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
98
99
/* --- Globals ------------------------------------------------------------
100
101
NOTE: In the interpreter's initialization phase, some globals are currently
102
      initialized dynamically as needed. In the process Unicode objects may
103
      be created before the Unicode type is ready.
104
105
*/
106
107
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
108
// The value must be the same in fileutils.c.
109
84.6M
#define MAX_UNICODE 0x10ffff
110
111
#ifdef Py_DEBUG
112
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
113
#else
114
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
115
#endif
116
117
/* Load, with acquire ordering, the `utf8` pointer stored in the compact
   Unicode header of `op`. */
static inline char* _PyUnicode_UTF8(PyObject *op)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(compact->utf8);
}
121
122
/* Return the UTF-8 buffer of `op`.

   For a compact ASCII string this is the address just past the
   PyASCIIObject header; for every other string it is the stored `utf8`
   pointer as read by _PyUnicode_UTF8(). */
static inline char* PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyUnicode_UTF8(op);
    }
    return (char *)(_PyASCIIObject_CAST(op) + 1);
}
132
133
/* Store `utf8`, with release ordering, into the `utf8` field of the compact
   Unicode header of `op`. */
static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    FT_ATOMIC_STORE_PTR_RELEASE(compact->utf8, utf8);
}
137
138
/* Return the UTF-8 length of `op`: the `length` field for a compact ASCII
   string, the `utf8_length` field otherwise. */
static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    return PyUnicode_IS_COMPACT_ASCII(op)
        ? _PyASCIIObject_CAST(op)->length
        : _PyCompactUnicodeObject_CAST(op)->utf8_length;
}
148
149
/* Write `length` into the `utf8_length` field of the compact Unicode header
   of `op`. */
static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    compact->utf8_length = length;
}
153
154
/* Direct accessors for the `length`, `state` and `hash` fields of the
   PyASCIIObject header of `op`.  Each expands to a parenthesized member
   access. */
#define _PyUnicode_LENGTH(op) (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op)  (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op)   (_PyASCIIObject_CAST(op)->hash)
160
161
104M
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
162
163
/* Store `hash` into the `hash` field of the PyASCIIObject header of `op`
   using a relaxed atomic store. */
static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    FT_ATOMIC_STORE_SSIZE_RELAXED(ascii->hash, hash);
}
167
168
/* Direct accessor for the `data.any` member of the PyUnicodeObject `op`. */
#define _PyUnicode_DATA_ANY(op) (_PyUnicodeObject_CAST(op)->data.any)
170
171
/* Return non-zero if the stored UTF-8 pointer of `op` is the same address as
   its character data.  `op` must not be a compact ASCII string. */
static inline int _PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));
    const void *utf8 = _PyUnicode_UTF8(op);
    const void *data = PyUnicode_DATA(op);
    return utf8 == data;
}
177
178
/* true if the Unicode object has an allocated UTF-8 memory block
179
   (not shared with other data) */
180
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
{
    /* Compact ASCII strings never qualify. */
    if (PyUnicode_IS_COMPACT_ASCII(op)) {
        return 0;
    }
    /* No UTF-8 pointer stored at all. */
    if (_PyUnicode_UTF8(op) == NULL) {
        return 0;
    }
    /* A UTF-8 pointer equal to the character data is shared, not separate. */
    return _PyUnicode_UTF8(op) != PyUnicode_DATA(op);
}
186
187
188
/* Generic helper macro to convert characters of different types.
189
   from_type and to_type have to be valid type names, begin and end
190
   are pointers to the source characters which should be of type
191
   "from_type *".  to is a pointer of type "to_type *" and points to the
192
   buffer where the result characters are written to. */
193
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)      \
    do {                                                                  \
        to_type *_dst = (to_type *)(to);                                  \
        const from_type *_src = (const from_type *)(begin);               \
        const from_type *_stop = (const from_type *)(end);                \
        Py_ssize_t _count = _stop - _src;                                 \
        /* end of the part copied four characters per iteration */        \
        const from_type *_stop4 = _src + _Py_SIZE_ROUND_DOWN(_count, 4);  \
        for (; _src < _stop4; _src += 4, _dst += 4) {                     \
            _dst[0] = (to_type) _src[0];                                  \
            _dst[1] = (to_type) _src[1];                                  \
            _dst[2] = (to_type) _src[2];                                  \
            _dst[3] = (to_type) _src[3];                                  \
        }                                                                 \
        /* remaining 0..3 characters */                                   \
        for (; _src < _stop; ++_src, ++_dst) {                            \
            *_dst = (to_type) *_src;                                      \
        }                                                                 \
    } while (0)
211
212
243M
#define LATIN1 _Py_LATIN1_CHR
213
214
#ifdef MS_WINDOWS
215
   /* On Windows, overallocating by 50% is the best factor */
216
#  define OVERALLOCATE_FACTOR 2
217
#else
218
   /* On Linux, overallocating by 25% is the best factor */
219
99.3M
#  define OVERALLOCATE_FACTOR 4
220
#endif
221
222
/* Forward declaration */
223
static inline int
224
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
225
static inline void
226
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
227
static PyObject *
228
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
229
                    const char *errors);
230
static PyObject *
231
unicode_decode_utf8(const char *s, Py_ssize_t size,
232
                    _Py_error_handler error_handler, const char *errors,
233
                    Py_ssize_t *consumed);
234
static int
235
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
236
                           const char *s, Py_ssize_t size,
237
                           _Py_error_handler error_handler, const char *errors,
238
                           Py_ssize_t *consumed);
239
#ifdef Py_DEBUG
240
static inline int unicode_is_finalizing(void);
241
static int unicode_is_singleton(PyObject *unicode);
242
#endif
243
244
245
// Return a reference to the immortal empty string singleton.
246
static inline PyObject* unicode_get_empty(void)
247
96.5M
{
248
96.5M
    _Py_DECLARE_STR(empty, "");
249
96.5M
    return &_Py_STR(empty);
250
96.5M
}
251
252
/* This dictionary holds per-interpreter interned strings.
253
 * See InternalDocs/string_interning.md for details.
254
 */
255
/* Return the `interned_strings` cached object of `interp`. */
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
{
    PyObject *interned = _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
    return interned;
}
259
260
/* This hashtable holds statically allocated interned strings.
261
 * See InternalDocs/string_interning.md for details.
262
 */
263
3.19M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
264
265
/* Get number of all interned strings for the current interpreter. */
266
Py_ssize_t
267
_PyUnicode_InternedSize(void)
268
0
{
269
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
270
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
271
0
}
272
273
/* Get number of immortal interned strings for the current interpreter. */
274
Py_ssize_t
275
_PyUnicode_InternedSize_Immortal(void)
276
0
{
277
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
278
0
    PyObject *key, *value;
279
0
    Py_ssize_t pos = 0;
280
0
    Py_ssize_t count = 0;
281
282
    // It's tempting to keep a count and avoid a loop here. But, this function
283
    // is intended for refleak tests. It spends extra work to report the true
284
    // value, to help detect bugs in optimizations.
285
286
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
287
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
288
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
289
0
           count++;
290
0
       }
291
0
    }
292
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
293
0
}
294
295
static Py_hash_t unicode_hash(PyObject *);
296
297
static Py_uhash_t
298
hashtable_unicode_hash(const void *key)
299
3.19M
{
300
3.19M
    return unicode_hash((PyObject *)key);
301
3.19M
}
302
303
static int
304
hashtable_unicode_compare(const void *key1, const void *key2)
305
278k
{
306
278k
    PyObject *obj1 = (PyObject *)key1;
307
278k
    PyObject *obj2 = (PyObject *)key2;
308
278k
    if (obj1 != NULL && obj2 != NULL) {
309
278k
        return unicode_eq(obj1, obj2);
310
278k
    }
311
0
    else {
312
0
        return obj1 == obj2;
313
0
    }
314
278k
}
315
316
/* Return true if this interpreter should share the main interpreter's
317
   intern_dict.  That's important for interpreters which load basic
318
   single-phase init extension modules (m_size == -1).  There could be interned
319
   immortal strings that are shared between interpreters, due to the
320
   PyDict_Update(mdict, m_copy) call in import_find_extension().
321
322
   It's not safe to deallocate those strings until all interpreters that
323
   potentially use them are freed.  By storing them in the main interpreter, we
324
   ensure they get freed after all other interpreters are freed.
325
*/
326
static bool
327
has_shared_intern_dict(PyInterpreterState *interp)
328
16
{
329
16
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
330
16
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
331
16
}
332
333
static int
334
init_interned_dict(PyInterpreterState *interp)
335
16
{
336
16
    assert(get_interned_dict(interp) == NULL);
337
16
    PyObject *interned;
338
16
    if (has_shared_intern_dict(interp)) {
339
0
        interned = get_interned_dict(_PyInterpreterState_Main());
340
0
        Py_INCREF(interned);
341
0
    }
342
16
    else {
343
16
        interned = PyDict_New();
344
16
        if (interned == NULL) {
345
0
            return -1;
346
0
        }
347
16
    }
348
16
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
349
16
    return 0;
350
16
}
351
352
static void
353
clear_interned_dict(PyInterpreterState *interp)
354
0
{
355
0
    PyObject *interned = get_interned_dict(interp);
356
0
    if (interned != NULL) {
357
0
        if (!has_shared_intern_dict(interp)) {
358
            // only clear if the dict belongs to this interpreter
359
0
            PyDict_Clear(interned);
360
0
        }
361
0
        Py_DECREF(interned);
362
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
363
0
    }
364
0
}
365
366
static PyStatus
367
init_global_interned_strings(PyInterpreterState *interp)
368
16
{
369
16
    assert(INTERNED_STRINGS == NULL);
370
16
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
371
372
16
    INTERNED_STRINGS = _Py_hashtable_new_full(
373
16
        hashtable_unicode_hash,
374
16
        hashtable_unicode_compare,
375
        // Objects stored here are immortal and statically allocated,
376
        // so we don't need key_destroy_func & value_destroy_func:
377
16
        NULL,
378
16
        NULL,
379
16
        &hashtable_alloc
380
16
    );
381
16
    if (INTERNED_STRINGS == NULL) {
382
0
        PyErr_Clear();
383
0
        return _PyStatus_ERR("failed to create global interned dict");
384
0
    }
385
386
    /* Intern statically allocated string identifiers, deepfreeze strings,
387
        * and one-byte latin-1 strings.
388
        * This must be done before any module initialization so that statically
389
        * allocated string identifiers are used instead of heap allocated strings.
390
        * Deepfreeze uses the interned identifiers if present to save space
391
        * else generates them and they are interned to speed up dict lookups.
392
    */
393
16
    _PyUnicode_InitStaticStrings(interp);
394
395
4.11k
    for (int i = 0; i < 256; i++) {
396
4.09k
        PyObject *s = LATIN1(i);
397
4.09k
        _PyUnicode_InternStatic(interp, &s);
398
4.09k
        assert(s == LATIN1(i));
399
4.09k
    }
400
#ifdef Py_DEBUG
401
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
402
403
    for (int i = 0; i < 256; i++) {
404
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
405
    }
406
#endif
407
16
    return _PyStatus_OK();
408
16
}
409
410
static void clear_global_interned_strings(void)
411
0
{
412
0
    if (INTERNED_STRINGS != NULL) {
413
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
414
0
        INTERNED_STRINGS = NULL;
415
0
    }
416
0
}
417
418
/* Return the result of unicode_get_empty() from the enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { return unicode_get_empty(); } while (0)
422
423
static inline void
424
unicode_fill(int kind, void *data, Py_UCS4 value,
425
             Py_ssize_t start, Py_ssize_t length)
426
11.5M
{
427
11.5M
    assert(0 <= start);
428
11.5M
    switch (kind) {
429
3.27M
    case PyUnicode_1BYTE_KIND: {
430
3.27M
        assert(value <= 0xff);
431
3.27M
        Py_UCS1 ch = (unsigned char)value;
432
3.27M
        Py_UCS1 *to = (Py_UCS1 *)data + start;
433
3.27M
        memset(to, ch, length);
434
3.27M
        break;
435
0
    }
436
5.97M
    case PyUnicode_2BYTE_KIND: {
437
5.97M
        assert(value <= 0xffff);
438
5.97M
        Py_UCS2 ch = (Py_UCS2)value;
439
5.97M
        Py_UCS2 *to = (Py_UCS2 *)data + start;
440
5.97M
        const Py_UCS2 *end = to + length;
441
53.0M
        for (; to < end; ++to) *to = ch;
442
5.97M
        break;
443
0
    }
444
2.34M
    case PyUnicode_4BYTE_KIND: {
445
2.34M
        assert(value <= MAX_UNICODE);
446
2.34M
        Py_UCS4 ch = value;
447
2.34M
        Py_UCS4 * to = (Py_UCS4 *)data + start;
448
2.34M
        const Py_UCS4 *end = to + length;
449
20.7M
        for (; to < end; ++to) *to = ch;
450
2.34M
        break;
451
0
    }
452
0
    default: Py_UNREACHABLE();
453
11.5M
    }
454
11.5M
}
455
456
457
/* Fast detection of the most frequent whitespace characters */
458
/* 128-entry lookup table indexed by ASCII code: 1 for the whitespace
   characters listed below, 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    [0x7F] = 0      /* last index: keeps the table at 128 entries */
};
487
488
/* forward */
489
static PyObject* get_latin1_char(unsigned char ch);
490
static int unicode_modifiable(PyObject *unicode);
491
492
493
static PyObject *
494
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
495
static PyObject *
496
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
497
static PyObject *
498
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
499
500
static PyObject *
501
unicode_encode_call_errorhandler(const char *errors,
502
       PyObject **errorHandler,const char *encoding, const char *reason,
503
       PyObject *unicode, PyObject **exceptionObject,
504
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
505
506
static void
507
raise_encode_exception(PyObject **exceptionObject,
508
                       const char *encoding,
509
                       PyObject *unicode,
510
                       Py_ssize_t startpos, Py_ssize_t endpos,
511
                       const char *reason);
512
513
/* Same for linebreaks */
514
/* 128-entry lookup table indexed by ASCII code: 1 for the line break
   characters listed below, 0 for everything else. */
static const unsigned char ascii_linebreak[] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x7F] = 0      /* last index: keeps the table at 128 entries */
};
540
541
static int convert_uc(PyObject *obj, void *addr);
542
543
struct encoding_map;
544
#include "clinic/unicodeobject.c.h"
545
546
_Py_error_handler
547
_Py_GetErrorHandler(const char *errors)
548
533k
{
549
533k
    if (errors == NULL || strcmp(errors, "strict") == 0) {
550
204k
        return _Py_ERROR_STRICT;
551
204k
    }
552
329k
    if (strcmp(errors, "surrogateescape") == 0) {
553
176k
        return _Py_ERROR_SURROGATEESCAPE;
554
176k
    }
555
152k
    if (strcmp(errors, "replace") == 0) {
556
152k
        return _Py_ERROR_REPLACE;
557
152k
    }
558
0
    if (strcmp(errors, "ignore") == 0) {
559
0
        return _Py_ERROR_IGNORE;
560
0
    }
561
0
    if (strcmp(errors, "backslashreplace") == 0) {
562
0
        return _Py_ERROR_BACKSLASHREPLACE;
563
0
    }
564
0
    if (strcmp(errors, "surrogatepass") == 0) {
565
0
        return _Py_ERROR_SURROGATEPASS;
566
0
    }
567
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
568
0
        return _Py_ERROR_XMLCHARREFREPLACE;
569
0
    }
570
0
    return _Py_ERROR_OTHER;
571
0
}
572
573
574
static _Py_error_handler
575
get_error_handler_wide(const wchar_t *errors)
576
5.57k
{
577
5.57k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
578
0
        return _Py_ERROR_STRICT;
579
0
    }
580
5.57k
    if (wcscmp(errors, L"surrogateescape") == 0) {
581
5.57k
        return _Py_ERROR_SURROGATEESCAPE;
582
5.57k
    }
583
0
    if (wcscmp(errors, L"replace") == 0) {
584
0
        return _Py_ERROR_REPLACE;
585
0
    }
586
0
    if (wcscmp(errors, L"ignore") == 0) {
587
0
        return _Py_ERROR_IGNORE;
588
0
    }
589
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
590
0
        return _Py_ERROR_BACKSLASHREPLACE;
591
0
    }
592
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
593
0
        return _Py_ERROR_SURROGATEPASS;
594
0
    }
595
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
596
0
        return _Py_ERROR_XMLCHARREFREPLACE;
597
0
    }
598
0
    return _Py_ERROR_OTHER;
599
0
}
600
601
602
static inline int
603
unicode_check_encoding_errors(const char *encoding, const char *errors)
604
20.7M
{
605
20.7M
    if (encoding == NULL && errors == NULL) {
606
10.5M
        return 0;
607
10.5M
    }
608
609
10.2M
    PyInterpreterState *interp = _PyInterpreterState_GET();
610
10.2M
#ifndef Py_DEBUG
611
    /* In release mode, only check in development mode (-X dev) */
612
10.2M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
613
10.2M
        return 0;
614
10.2M
    }
615
#else
616
    /* Always check in debug mode */
617
#endif
618
619
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
620
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
621
0
    if (!interp->unicode.fs_codec.encoding) {
622
0
        return 0;
623
0
    }
624
625
    /* Disable checks during Python finalization. For example, it allows to
626
       call _PyObject_Dump() during finalization for debugging purpose. */
627
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
628
0
        return 0;
629
0
    }
630
631
0
    if (encoding != NULL
632
        // Fast path for the most common built-in encodings. Even if the codec
633
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
634
        // create a temporary Unicode string (the key in the cache).
635
0
        && strcmp(encoding, "utf-8") != 0
636
0
        && strcmp(encoding, "utf8") != 0
637
0
        && strcmp(encoding, "ascii") != 0)
638
0
    {
639
0
        PyObject *handler = _PyCodec_Lookup(encoding);
640
0
        if (handler == NULL) {
641
0
            return -1;
642
0
        }
643
0
        Py_DECREF(handler);
644
0
    }
645
646
0
    if (errors != NULL
647
        // Fast path for the most common built-in error handlers.
648
0
        && strcmp(errors, "strict") != 0
649
0
        && strcmp(errors, "ignore") != 0
650
0
        && strcmp(errors, "replace") != 0
651
0
        && strcmp(errors, "surrogateescape") != 0
652
0
        && strcmp(errors, "surrogatepass") != 0)
653
0
    {
654
0
        PyObject *handler = PyCodec_LookupError(errors);
655
0
        if (handler == NULL) {
656
0
            return -1;
657
0
        }
658
0
        Py_DECREF(handler);
659
0
    }
660
0
    return 0;
661
0
}
662
663
664
int
665
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
666
0
{
667
0
#define CHECK(expr) \
668
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
669
670
0
    assert(op != NULL);
671
0
    CHECK(PyUnicode_Check(op));
672
673
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
674
0
    int kind = ascii->state.kind;
675
676
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
677
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
678
0
    }
679
0
    else {
680
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
681
0
        void *data;
682
683
0
        if (ascii->state.compact == 1) {
684
0
            data = compact + 1;
685
0
            CHECK(kind == PyUnicode_1BYTE_KIND
686
0
                                 || kind == PyUnicode_2BYTE_KIND
687
0
                                 || kind == PyUnicode_4BYTE_KIND);
688
0
            CHECK(ascii->state.ascii == 0);
689
0
            CHECK(_PyUnicode_UTF8(op) != data);
690
0
        }
691
0
        else {
692
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
693
694
0
            data = unicode->data.any;
695
0
            CHECK(kind == PyUnicode_1BYTE_KIND
696
0
                     || kind == PyUnicode_2BYTE_KIND
697
0
                     || kind == PyUnicode_4BYTE_KIND);
698
0
            CHECK(ascii->state.compact == 0);
699
0
            CHECK(data != NULL);
700
0
            if (ascii->state.ascii) {
701
0
                CHECK(_PyUnicode_UTF8(op) == data);
702
0
                CHECK(compact->utf8_length == ascii->length);
703
0
            }
704
0
            else {
705
0
                CHECK(_PyUnicode_UTF8(op) != data);
706
0
            }
707
0
        }
708
0
#ifndef Py_GIL_DISABLED
709
0
        if (_PyUnicode_UTF8(op) == NULL)
710
0
            CHECK(compact->utf8_length == 0);
711
0
#endif
712
0
    }
713
714
    /* check that the best kind is used: O(n) operation */
715
0
    if (check_content) {
716
0
        Py_ssize_t i;
717
0
        Py_UCS4 maxchar = 0;
718
0
        const void *data;
719
0
        Py_UCS4 ch;
720
721
0
        data = PyUnicode_DATA(ascii);
722
0
        for (i=0; i < ascii->length; i++)
723
0
        {
724
0
            ch = PyUnicode_READ(kind, data, i);
725
0
            if (ch > maxchar)
726
0
                maxchar = ch;
727
0
        }
728
0
        if (kind == PyUnicode_1BYTE_KIND) {
729
0
            if (ascii->state.ascii == 0) {
730
0
                CHECK(maxchar >= 128);
731
0
                CHECK(maxchar <= 255);
732
0
            }
733
0
            else
734
0
                CHECK(maxchar < 128);
735
0
        }
736
0
        else if (kind == PyUnicode_2BYTE_KIND) {
737
0
            CHECK(maxchar >= 0x100);
738
0
            CHECK(maxchar <= 0xFFFF);
739
0
        }
740
0
        else {
741
0
            CHECK(maxchar >= 0x10000);
742
0
            CHECK(maxchar <= MAX_UNICODE);
743
0
        }
744
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
745
0
    }
746
747
    /* Check interning state */
748
#ifdef Py_DEBUG
749
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
750
    // extensions can make immortal strings mortal (but with a high enough
751
    // refcount).
752
    // The other way is extremely unlikely (worth a potential failed assertion
753
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
754
    switch (PyUnicode_CHECK_INTERNED(op)) {
755
        case SSTATE_NOT_INTERNED:
756
            if (ascii->state.statically_allocated) {
757
                // This state is for two exceptions:
758
                // - strings are currently checked before they're interned
759
                // - the 256 one-latin1-character strings
760
                //   are static but use SSTATE_NOT_INTERNED
761
            }
762
            else {
763
                CHECK(!_Py_IsImmortal(op));
764
            }
765
            break;
766
        case SSTATE_INTERNED_MORTAL:
767
            CHECK(!ascii->state.statically_allocated);
768
            CHECK(!_Py_IsImmortal(op));
769
            break;
770
        case SSTATE_INTERNED_IMMORTAL:
771
            CHECK(!ascii->state.statically_allocated);
772
            break;
773
        case SSTATE_INTERNED_IMMORTAL_STATIC:
774
            CHECK(ascii->state.statically_allocated);
775
            break;
776
        default:
777
            Py_UNREACHABLE();
778
    }
779
#endif
780
781
0
    return 1;
782
783
0
#undef CHECK
784
0
}
785
786
/* Replace `unicode` by the matching singleton when there is one.

   An empty string becomes the empty-string singleton and a one-character
   string of 1-byte kind becomes the LATIN1() singleton of that character;
   in both cases the reference to `unicode` is released unless it already is
   the singleton.  Any other string is returned as is. */
static PyObject*
unicode_result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    PyObject *singleton = NULL;
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        singleton = unicode_get_empty();
    }
    else if (length == 1 && PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
        Py_UCS1 ch = PyUnicode_1BYTE_DATA(unicode)[0];
        singleton = LATIN1(ch);
    }

    if (singleton != NULL) {
        if (unicode != singleton) {
            Py_DECREF(unicode);
        }
        return singleton;
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}
816
817
/* Return `unicode` as an exact str object: a new reference to `unicode`
   itself when its type is exactly str, otherwise the result of
   _PyUnicode_Copy(). */
static PyObject*
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        /* Subtype -- return genuine unicode string with the same value. */
        return _PyUnicode_Copy(unicode);
    }
    return Py_NewRef(unicode);
}
827
828
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
829
   ASCII, Latin1, UTF-8, etc. */
830
static char*
831
backslashreplace(_PyBytesWriter *writer, char *str,
832
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
833
0
{
834
0
    Py_ssize_t size, i;
835
0
    Py_UCS4 ch;
836
0
    int kind;
837
0
    const void *data;
838
839
0
    kind = PyUnicode_KIND(unicode);
840
0
    data = PyUnicode_DATA(unicode);
841
842
0
    size = 0;
843
    /* determine replacement size */
844
0
    for (i = collstart; i < collend; ++i) {
845
0
        Py_ssize_t incr;
846
847
0
        ch = PyUnicode_READ(kind, data, i);
848
0
        if (ch < 0x100)
849
0
            incr = 2+2;
850
0
        else if (ch < 0x10000)
851
0
            incr = 2+4;
852
0
        else {
853
0
            assert(ch <= MAX_UNICODE);
854
0
            incr = 2+8;
855
0
        }
856
0
        if (size > PY_SSIZE_T_MAX - incr) {
857
0
            PyErr_SetString(PyExc_OverflowError,
858
0
                            "encoded result is too long for a Python string");
859
0
            return NULL;
860
0
        }
861
0
        size += incr;
862
0
    }
863
864
0
    str = _PyBytesWriter_Prepare(writer, str, size);
865
0
    if (str == NULL)
866
0
        return NULL;
867
868
    /* generate replacement */
869
0
    for (i = collstart; i < collend; ++i) {
870
0
        ch = PyUnicode_READ(kind, data, i);
871
0
        *str++ = '\\';
872
0
        if (ch >= 0x00010000) {
873
0
            *str++ = 'U';
874
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
875
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
876
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
877
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
878
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
879
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
880
0
        }
881
0
        else if (ch >= 0x100) {
882
0
            *str++ = 'u';
883
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
884
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
885
0
        }
886
0
        else
887
0
            *str++ = 'x';
888
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
889
0
        *str++ = Py_hexdigits[ch&0xf];
890
0
    }
891
0
    return str;
892
0
}
893
894
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
895
   ASCII, Latin1, UTF-8, etc. */
896
static char*
897
xmlcharrefreplace(_PyBytesWriter *writer, char *str,
898
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
899
0
{
900
0
    Py_ssize_t size, i;
901
0
    Py_UCS4 ch;
902
0
    int kind;
903
0
    const void *data;
904
905
0
    kind = PyUnicode_KIND(unicode);
906
0
    data = PyUnicode_DATA(unicode);
907
908
0
    size = 0;
909
    /* determine replacement size */
910
0
    for (i = collstart; i < collend; ++i) {
911
0
        Py_ssize_t incr;
912
913
0
        ch = PyUnicode_READ(kind, data, i);
914
0
        if (ch < 10)
915
0
            incr = 2+1+1;
916
0
        else if (ch < 100)
917
0
            incr = 2+2+1;
918
0
        else if (ch < 1000)
919
0
            incr = 2+3+1;
920
0
        else if (ch < 10000)
921
0
            incr = 2+4+1;
922
0
        else if (ch < 100000)
923
0
            incr = 2+5+1;
924
0
        else if (ch < 1000000)
925
0
            incr = 2+6+1;
926
0
        else {
927
0
            assert(ch <= MAX_UNICODE);
928
0
            incr = 2+7+1;
929
0
        }
930
0
        if (size > PY_SSIZE_T_MAX - incr) {
931
0
            PyErr_SetString(PyExc_OverflowError,
932
0
                            "encoded result is too long for a Python string");
933
0
            return NULL;
934
0
        }
935
0
        size += incr;
936
0
    }
937
938
0
    str = _PyBytesWriter_Prepare(writer, str, size);
939
0
    if (str == NULL)
940
0
        return NULL;
941
942
    /* generate replacement */
943
0
    for (i = collstart; i < collend; ++i) {
944
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
945
0
        if (size < 0) {
946
0
            return NULL;
947
0
        }
948
0
        str += size;
949
0
    }
950
0
    return str;
951
0
}
952
953
/* --- Bloom Filters ----------------------------------------------------- */
954
955
/* stuff to implement simple "bloom filters" for Unicode characters.
956
   to keep things simple, we use a single bitmask; the low log2(BLOOM_WIDTH)
957
   bits (5, 6 or 7) of each Unicode character are used as the bit index. */
958
959
/* the linebreak mask is set up by _PyUnicode_Init() below */
960
961
/* Number of bits in the bloom mask: the largest supported power of two that
   fits in an unsigned long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* Starts with every bit set, so BLOOM() never rejects a character until a
   real mask has been stored. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that arbitrary expressions (e.g.
   `a | b`) can be passed safely. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line-break test: table lookup for ch < 128, otherwise the bloom mask is
   used as a cheap pre-filter before calling Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
980
981
static inline BLOOM_MASK
982
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
983
7.91M
{
984
7.91M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
985
7.91M
    do {                                               \
986
7.91M
        TYPE *data = (TYPE *)PTR;                      \
987
7.91M
        TYPE *end = data + LEN;                        \
988
7.91M
        Py_UCS4 ch;                                    \
989
17.5M
        for (; data != end; data++) {                  \
990
9.66M
            ch = *data;                                \
991
9.66M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
992
9.66M
        }                                              \
993
7.91M
        break;                                         \
994
7.91M
    } while (0)
995
996
    /* calculate simple bloom-style bitmask for a given unicode string */
997
998
7.91M
    BLOOM_MASK mask;
999
1000
7.91M
    mask = 0;
1001
7.91M
    switch (kind) {
1002
7.91M
    case PyUnicode_1BYTE_KIND:
1003
7.91M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
1004
0
        break;
1005
16
    case PyUnicode_2BYTE_KIND:
1006
16
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
1007
0
        break;
1008
0
    case PyUnicode_4BYTE_KIND:
1009
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
1010
0
        break;
1011
0
    default:
1012
0
        Py_UNREACHABLE();
1013
7.91M
    }
1014
7.91M
    return mask;
1015
1016
7.91M
#undef BLOOM_UPDATE
1017
7.91M
}
1018
1019
static int
1020
ensure_unicode(PyObject *obj)
1021
156M
{
1022
156M
    if (!PyUnicode_Check(obj)) {
1023
0
        PyErr_Format(PyExc_TypeError,
1024
0
                     "must be str, not %.100s",
1025
0
                     Py_TYPE(obj)->tp_name);
1026
0
        return -1;
1027
0
    }
1028
156M
    return 0;
1029
156M
}
1030
1031
/* Compilation of templated routines */
1032
1033
1.05M
#define STRINGLIB_GET_EMPTY() unicode_get_empty()
1034
1035
#include "stringlib/asciilib.h"
1036
#include "stringlib/fastsearch.h"
1037
#include "stringlib/partition.h"
1038
#include "stringlib/split.h"
1039
#include "stringlib/count.h"
1040
#include "stringlib/find.h"
1041
#include "stringlib/find_max_char.h"
1042
#include "stringlib/undef.h"
1043
1044
#include "stringlib/ucs1lib.h"
1045
#include "stringlib/fastsearch.h"
1046
#include "stringlib/partition.h"
1047
#include "stringlib/split.h"
1048
#include "stringlib/count.h"
1049
#include "stringlib/find.h"
1050
#include "stringlib/replace.h"
1051
#include "stringlib/repr.h"
1052
#include "stringlib/find_max_char.h"
1053
#include "stringlib/undef.h"
1054
1055
#include "stringlib/ucs2lib.h"
1056
#include "stringlib/fastsearch.h"
1057
#include "stringlib/partition.h"
1058
#include "stringlib/split.h"
1059
#include "stringlib/count.h"
1060
#include "stringlib/find.h"
1061
#include "stringlib/replace.h"
1062
#include "stringlib/repr.h"
1063
#include "stringlib/find_max_char.h"
1064
#include "stringlib/undef.h"
1065
1066
#include "stringlib/ucs4lib.h"
1067
#include "stringlib/fastsearch.h"
1068
#include "stringlib/partition.h"
1069
#include "stringlib/split.h"
1070
#include "stringlib/count.h"
1071
#include "stringlib/find.h"
1072
#include "stringlib/replace.h"
1073
#include "stringlib/repr.h"
1074
#include "stringlib/find_max_char.h"
1075
#include "stringlib/undef.h"
1076
1077
#undef STRINGLIB_GET_EMPTY
1078
1079
/* --- Unicode Object ----------------------------------------------------- */
1080
1081
static inline Py_ssize_t
1082
findchar(const void *s, int kind,
1083
         Py_ssize_t size, Py_UCS4 ch,
1084
         int direction)
1085
106M
{
1086
106M
    switch (kind) {
1087
97.4M
    case PyUnicode_1BYTE_KIND:
1088
97.4M
        if ((Py_UCS1) ch != ch)
1089
3.67k
            return -1;
1090
97.4M
        if (direction > 0)
1091
97.4M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1092
7.95k
        else
1093
7.95k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1094
7.94M
    case PyUnicode_2BYTE_KIND:
1095
7.94M
        if ((Py_UCS2) ch != ch)
1096
0
            return -1;
1097
7.94M
        if (direction > 0)
1098
7.91M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1099
33.6k
        else
1100
33.6k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1101
795k
    case PyUnicode_4BYTE_KIND:
1102
795k
        if (direction > 0)
1103
685k
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1104
110k
        else
1105
110k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1106
0
    default:
1107
0
        Py_UNREACHABLE();
1108
106M
    }
1109
106M
}
1110
1111
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten (every byte is set to 0xff); nothing happens if the string did
   not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* The kind value doubles as the character size in bytes. */
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1129
1130
static PyObject*
1131
resize_copy(PyObject *unicode, Py_ssize_t length)
1132
0
{
1133
0
    Py_ssize_t copy_length;
1134
0
    PyObject *copy;
1135
1136
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1137
0
    if (copy == NULL)
1138
0
        return NULL;
1139
1140
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1141
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1142
0
    return copy;
1143
0
}
1144
1145
static PyObject*
1146
resize_compact(PyObject *unicode, Py_ssize_t length)
1147
57.2M
{
1148
57.2M
    Py_ssize_t char_size;
1149
57.2M
    Py_ssize_t struct_size;
1150
57.2M
    Py_ssize_t new_size;
1151
57.2M
    PyObject *new_unicode;
1152
#ifdef Py_DEBUG
1153
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1154
#endif
1155
1156
57.2M
    if (!unicode_modifiable(unicode)) {
1157
0
        PyObject *copy = resize_copy(unicode, length);
1158
0
        if (copy == NULL) {
1159
0
            return NULL;
1160
0
        }
1161
0
        Py_DECREF(unicode);
1162
0
        return copy;
1163
0
    }
1164
57.2M
    assert(PyUnicode_IS_COMPACT(unicode));
1165
1166
57.2M
    char_size = PyUnicode_KIND(unicode);
1167
57.2M
    if (PyUnicode_IS_ASCII(unicode))
1168
48.8M
        struct_size = sizeof(PyASCIIObject);
1169
8.38M
    else
1170
8.38M
        struct_size = sizeof(PyCompactUnicodeObject);
1171
1172
57.2M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1173
0
        PyErr_NoMemory();
1174
0
        return NULL;
1175
0
    }
1176
57.2M
    new_size = (struct_size + (length + 1) * char_size);
1177
1178
57.2M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1179
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1180
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1181
0
        PyUnicode_SET_UTF8(unicode, NULL);
1182
0
    }
1183
#ifdef Py_TRACE_REFS
1184
    _Py_ForgetReference(unicode);
1185
#endif
1186
57.2M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1187
1188
57.2M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1189
57.2M
    if (new_unicode == NULL) {
1190
0
        _Py_NewReferenceNoTotal(unicode);
1191
0
        PyErr_NoMemory();
1192
0
        return NULL;
1193
0
    }
1194
57.2M
    unicode = new_unicode;
1195
57.2M
    _Py_NewReferenceNoTotal(unicode);
1196
1197
57.2M
    _PyUnicode_LENGTH(unicode) = length;
1198
#ifdef Py_DEBUG
1199
    unicode_fill_invalid(unicode, old_length);
1200
#endif
1201
57.2M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1202
57.2M
                    length, 0);
1203
57.2M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1204
57.2M
    return unicode;
1205
57.2M
}
1206
1207
static int
1208
resize_inplace(PyObject *unicode, Py_ssize_t length)
1209
0
{
1210
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1211
0
    assert(Py_REFCNT(unicode) == 1);
1212
1213
0
    Py_ssize_t new_size;
1214
0
    Py_ssize_t char_size;
1215
0
    int share_utf8;
1216
0
    void *data;
1217
#ifdef Py_DEBUG
1218
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1219
#endif
1220
1221
0
    data = _PyUnicode_DATA_ANY(unicode);
1222
0
    char_size = PyUnicode_KIND(unicode);
1223
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1224
1225
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1226
0
        PyErr_NoMemory();
1227
0
        return -1;
1228
0
    }
1229
0
    new_size = (length + 1) * char_size;
1230
1231
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1232
0
    {
1233
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1234
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1235
0
        PyUnicode_SET_UTF8(unicode, NULL);
1236
0
    }
1237
1238
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1239
0
    if (data == NULL) {
1240
0
        PyErr_NoMemory();
1241
0
        return -1;
1242
0
    }
1243
0
    _PyUnicode_DATA_ANY(unicode) = data;
1244
0
    if (share_utf8) {
1245
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1246
0
        PyUnicode_SET_UTF8(unicode, data);
1247
0
    }
1248
0
    _PyUnicode_LENGTH(unicode) = length;
1249
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1250
#ifdef Py_DEBUG
1251
    unicode_fill_invalid(unicode, old_length);
1252
#endif
1253
1254
    /* check for integer overflow */
1255
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1256
0
        PyErr_NoMemory();
1257
0
        return -1;
1258
0
    }
1259
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1260
0
    return 0;
1261
0
}
1262
1263
static const char*
1264
unicode_kind_name(PyObject *unicode)
1265
0
{
1266
    /* don't check consistency: unicode_kind_name() is called from
1267
       _PyUnicode_Dump() */
1268
0
    if (!PyUnicode_IS_COMPACT(unicode))
1269
0
    {
1270
0
        switch (PyUnicode_KIND(unicode))
1271
0
        {
1272
0
        case PyUnicode_1BYTE_KIND:
1273
0
            if (PyUnicode_IS_ASCII(unicode))
1274
0
                return "legacy ascii";
1275
0
            else
1276
0
                return "legacy latin1";
1277
0
        case PyUnicode_2BYTE_KIND:
1278
0
            return "legacy UCS2";
1279
0
        case PyUnicode_4BYTE_KIND:
1280
0
            return "legacy UCS4";
1281
0
        default:
1282
0
            return "<legacy invalid kind>";
1283
0
        }
1284
0
    }
1285
0
    switch (PyUnicode_KIND(unicode)) {
1286
0
    case PyUnicode_1BYTE_KIND:
1287
0
        if (PyUnicode_IS_ASCII(unicode))
1288
0
            return "ascii";
1289
0
        else
1290
0
            return "latin1";
1291
0
    case PyUnicode_2BYTE_KIND:
1292
0
        return "UCS2";
1293
0
    case PyUnicode_4BYTE_KIND:
1294
0
        return "UCS4";
1295
0
    default:
1296
0
        return "<invalid compact kind>";
1297
0
    }
1298
0
}
1299
1300
#ifdef Py_DEBUG
1301
/* Functions wrapping macros for use in debugger */
1302
const char *_PyUnicode_utf8(void *unicode_raw){
1303
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1304
    return PyUnicode_UTF8(unicode);
1305
}
1306
1307
const void *_PyUnicode_compact_data(void *unicode_raw) {
1308
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1309
    return _PyUnicode_COMPACT_DATA(unicode);
1310
}
1311
const void *_PyUnicode_data(void *unicode_raw) {
1312
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1313
    printf("obj %p\n", (void*)unicode);
1314
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1315
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1316
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1317
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1318
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1319
    return PyUnicode_DATA(unicode);
1320
}
1321
1322
void
1323
_PyUnicode_Dump(PyObject *op)
1324
{
1325
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1326
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1327
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1328
    const void *data;
1329
1330
    if (ascii->state.compact)
1331
    {
1332
        if (ascii->state.ascii)
1333
            data = (ascii + 1);
1334
        else
1335
            data = (compact + 1);
1336
    }
1337
    else
1338
        data = unicode->data.any;
1339
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1340
1341
    if (!ascii->state.ascii) {
1342
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1343
    }
1344
    printf(", data=%p\n", data);
1345
}
1346
#endif
1347
1348
1349
PyObject *
1350
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1351
504M
{
1352
    /* Optimization for empty strings */
1353
504M
    if (size == 0) {
1354
23.0M
        return unicode_get_empty();
1355
23.0M
    }
1356
1357
481M
    PyObject *obj;
1358
481M
    PyCompactUnicodeObject *unicode;
1359
481M
    void *data;
1360
481M
    int kind;
1361
481M
    int is_ascii;
1362
481M
    Py_ssize_t char_size;
1363
481M
    Py_ssize_t struct_size;
1364
1365
481M
    is_ascii = 0;
1366
481M
    struct_size = sizeof(PyCompactUnicodeObject);
1367
481M
    if (maxchar < 128) {
1368
265M
        kind = PyUnicode_1BYTE_KIND;
1369
265M
        char_size = 1;
1370
265M
        is_ascii = 1;
1371
265M
        struct_size = sizeof(PyASCIIObject);
1372
265M
    }
1373
216M
    else if (maxchar < 256) {
1374
24.6M
        kind = PyUnicode_1BYTE_KIND;
1375
24.6M
        char_size = 1;
1376
24.6M
    }
1377
191M
    else if (maxchar < 65536) {
1378
185M
        kind = PyUnicode_2BYTE_KIND;
1379
185M
        char_size = 2;
1380
185M
    }
1381
5.86M
    else {
1382
5.86M
        if (maxchar > MAX_UNICODE) {
1383
0
            PyErr_SetString(PyExc_SystemError,
1384
0
                            "invalid maximum character passed to PyUnicode_New");
1385
0
            return NULL;
1386
0
        }
1387
5.86M
        kind = PyUnicode_4BYTE_KIND;
1388
5.86M
        char_size = 4;
1389
5.86M
    }
1390
1391
    /* Ensure we won't overflow the size. */
1392
481M
    if (size < 0) {
1393
0
        PyErr_SetString(PyExc_SystemError,
1394
0
                        "Negative size passed to PyUnicode_New");
1395
0
        return NULL;
1396
0
    }
1397
481M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1398
0
        return PyErr_NoMemory();
1399
1400
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1401
     * PyObject_New() so we are able to allocate space for the object and
1402
     * it's data buffer.
1403
     */
1404
481M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1405
481M
    if (obj == NULL) {
1406
0
        return PyErr_NoMemory();
1407
0
    }
1408
481M
    _PyObject_Init(obj, &PyUnicode_Type);
1409
1410
481M
    unicode = (PyCompactUnicodeObject *)obj;
1411
481M
    if (is_ascii)
1412
265M
        data = ((PyASCIIObject*)obj) + 1;
1413
216M
    else
1414
216M
        data = unicode + 1;
1415
481M
    _PyUnicode_LENGTH(unicode) = size;
1416
481M
    _PyUnicode_HASH(unicode) = -1;
1417
481M
    _PyUnicode_STATE(unicode).interned = 0;
1418
481M
    _PyUnicode_STATE(unicode).kind = kind;
1419
481M
    _PyUnicode_STATE(unicode).compact = 1;
1420
481M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1421
481M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1422
481M
    if (is_ascii) {
1423
265M
        ((char*)data)[size] = 0;
1424
265M
    }
1425
216M
    else if (kind == PyUnicode_1BYTE_KIND) {
1426
24.6M
        ((char*)data)[size] = 0;
1427
24.6M
        unicode->utf8 = NULL;
1428
24.6M
        unicode->utf8_length = 0;
1429
24.6M
    }
1430
191M
    else {
1431
191M
        unicode->utf8 = NULL;
1432
191M
        unicode->utf8_length = 0;
1433
191M
        if (kind == PyUnicode_2BYTE_KIND)
1434
185M
            ((Py_UCS2*)data)[size] = 0;
1435
5.86M
        else /* kind == PyUnicode_4BYTE_KIND */
1436
5.86M
            ((Py_UCS4*)data)[size] = 0;
1437
191M
    }
1438
#ifdef Py_DEBUG
1439
    unicode_fill_invalid((PyObject*)unicode, 0);
1440
#endif
1441
481M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1442
481M
    return obj;
1443
481M
}
1444
1445
static int
1446
unicode_check_modifiable(PyObject *unicode)
1447
738
{
1448
738
    if (!unicode_modifiable(unicode)) {
1449
0
        PyErr_SetString(PyExc_SystemError,
1450
0
                        "Cannot modify a string currently used");
1451
0
        return -1;
1452
0
    }
1453
738
    return 0;
1454
738
}
1455
1456
static int
1457
_copy_characters(PyObject *to, Py_ssize_t to_start,
1458
                 PyObject *from, Py_ssize_t from_start,
1459
                 Py_ssize_t how_many, int check_maxchar)
1460
252M
{
1461
252M
    int from_kind, to_kind;
1462
252M
    const void *from_data;
1463
252M
    void *to_data;
1464
1465
252M
    assert(0 <= how_many);
1466
252M
    assert(0 <= from_start);
1467
252M
    assert(0 <= to_start);
1468
252M
    assert(PyUnicode_Check(from));
1469
252M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1470
1471
252M
    assert(to == NULL || PyUnicode_Check(to));
1472
1473
252M
    if (how_many == 0) {
1474
260k
        return 0;
1475
260k
    }
1476
1477
252M
    assert(to != NULL);
1478
252M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1479
1480
252M
    from_kind = PyUnicode_KIND(from);
1481
252M
    from_data = PyUnicode_DATA(from);
1482
252M
    to_kind = PyUnicode_KIND(to);
1483
252M
    to_data = PyUnicode_DATA(to);
1484
1485
#ifdef Py_DEBUG
1486
    if (!check_maxchar
1487
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1488
    {
1489
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1490
        Py_UCS4 ch;
1491
        Py_ssize_t i;
1492
        for (i=0; i < how_many; i++) {
1493
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1494
            assert(ch <= to_maxchar);
1495
        }
1496
    }
1497
#endif
1498
1499
252M
    if (from_kind == to_kind) {
1500
155M
        if (check_maxchar
1501
155M
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1502
0
        {
1503
            /* Writing Latin-1 characters into an ASCII string requires to
1504
               check that all written characters are pure ASCII */
1505
0
            Py_UCS4 max_char;
1506
0
            max_char = ucs1lib_find_max_char(from_data,
1507
0
                                             (const Py_UCS1*)from_data + how_many);
1508
0
            if (max_char >= 128)
1509
0
                return -1;
1510
0
        }
1511
155M
        memcpy((char*)to_data + to_kind * to_start,
1512
155M
                  (const char*)from_data + from_kind * from_start,
1513
155M
                  to_kind * how_many);
1514
155M
    }
1515
96.8M
    else if (from_kind == PyUnicode_1BYTE_KIND
1516
96.8M
             && to_kind == PyUnicode_2BYTE_KIND)
1517
79.5M
    {
1518
79.5M
        _PyUnicode_CONVERT_BYTES(
1519
79.5M
            Py_UCS1, Py_UCS2,
1520
79.5M
            PyUnicode_1BYTE_DATA(from) + from_start,
1521
79.5M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1522
79.5M
            PyUnicode_2BYTE_DATA(to) + to_start
1523
79.5M
            );
1524
79.5M
    }
1525
17.2M
    else if (from_kind == PyUnicode_1BYTE_KIND
1526
17.2M
             && to_kind == PyUnicode_4BYTE_KIND)
1527
15.1M
    {
1528
15.1M
        _PyUnicode_CONVERT_BYTES(
1529
15.1M
            Py_UCS1, Py_UCS4,
1530
15.1M
            PyUnicode_1BYTE_DATA(from) + from_start,
1531
15.1M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1532
15.1M
            PyUnicode_4BYTE_DATA(to) + to_start
1533
15.1M
            );
1534
15.1M
    }
1535
2.10M
    else if (from_kind == PyUnicode_2BYTE_KIND
1536
2.10M
             && to_kind == PyUnicode_4BYTE_KIND)
1537
2.08M
    {
1538
2.08M
        _PyUnicode_CONVERT_BYTES(
1539
2.08M
            Py_UCS2, Py_UCS4,
1540
2.08M
            PyUnicode_2BYTE_DATA(from) + from_start,
1541
2.08M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1542
2.08M
            PyUnicode_4BYTE_DATA(to) + to_start
1543
2.08M
            );
1544
2.08M
    }
1545
20.5k
    else {
1546
20.5k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1547
1548
20.5k
        if (!check_maxchar) {
1549
20.5k
            if (from_kind == PyUnicode_2BYTE_KIND
1550
20.5k
                && to_kind == PyUnicode_1BYTE_KIND)
1551
2.27k
            {
1552
2.27k
                _PyUnicode_CONVERT_BYTES(
1553
2.27k
                    Py_UCS2, Py_UCS1,
1554
2.27k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1555
2.27k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1556
2.27k
                    PyUnicode_1BYTE_DATA(to) + to_start
1557
2.27k
                    );
1558
2.27k
            }
1559
18.2k
            else if (from_kind == PyUnicode_4BYTE_KIND
1560
18.2k
                     && to_kind == PyUnicode_1BYTE_KIND)
1561
8.85k
            {
1562
8.85k
                _PyUnicode_CONVERT_BYTES(
1563
8.85k
                    Py_UCS4, Py_UCS1,
1564
8.85k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1565
8.85k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1566
8.85k
                    PyUnicode_1BYTE_DATA(to) + to_start
1567
8.85k
                    );
1568
8.85k
            }
1569
9.37k
            else if (from_kind == PyUnicode_4BYTE_KIND
1570
9.37k
                     && to_kind == PyUnicode_2BYTE_KIND)
1571
9.37k
            {
1572
9.37k
                _PyUnicode_CONVERT_BYTES(
1573
9.37k
                    Py_UCS4, Py_UCS2,
1574
9.37k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1575
9.37k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1576
9.37k
                    PyUnicode_2BYTE_DATA(to) + to_start
1577
9.37k
                    );
1578
9.37k
            }
1579
0
            else {
1580
0
                Py_UNREACHABLE();
1581
0
            }
1582
20.5k
        }
1583
0
        else {
1584
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1585
0
            Py_UCS4 ch;
1586
0
            Py_ssize_t i;
1587
1588
0
            for (i=0; i < how_many; i++) {
1589
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1590
0
                if (ch > to_maxchar)
1591
0
                    return -1;
1592
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1593
0
            }
1594
0
        }
1595
20.5k
    }
1596
252M
    return 0;
1597
252M
}
1598
1599
void
1600
_PyUnicode_FastCopyCharacters(
1601
    PyObject *to, Py_ssize_t to_start,
1602
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1603
252M
{
1604
252M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1605
252M
}
1606
1607
Py_ssize_t
1608
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1609
                         PyObject *from, Py_ssize_t from_start,
1610
                         Py_ssize_t how_many)
1611
0
{
1612
0
    int err;
1613
1614
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1615
0
        PyErr_BadInternalCall();
1616
0
        return -1;
1617
0
    }
1618
1619
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1620
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1621
0
        return -1;
1622
0
    }
1623
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1624
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1625
0
        return -1;
1626
0
    }
1627
0
    if (how_many < 0) {
1628
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1629
0
        return -1;
1630
0
    }
1631
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1632
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1633
0
        PyErr_Format(PyExc_SystemError,
1634
0
                     "Cannot write %zi characters at %zi "
1635
0
                     "in a string of %zi characters",
1636
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1637
0
        return -1;
1638
0
    }
1639
1640
0
    if (how_many == 0)
1641
0
        return 0;
1642
1643
0
    if (unicode_check_modifiable(to))
1644
0
        return -1;
1645
1646
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1647
0
    if (err) {
1648
0
        PyErr_Format(PyExc_SystemError,
1649
0
                     "Cannot copy %s characters "
1650
0
                     "into a string of %s characters",
1651
0
                     unicode_kind_name(from),
1652
0
                     unicode_kind_name(to));
1653
0
        return -1;
1654
0
    }
1655
0
    return how_many;
1656
0
}
1657
1658
/* Find the maximum code point and count the number of surrogate pairs so a
1659
   correct string length can be computed before converting a string to UCS4.
1660
   This function counts single surrogates as a character and not as a pair.
1661
1662
   Return 0 on success, or -1 on error. */
1663
static int
1664
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1665
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1666
17.8k
{
1667
17.8k
    const wchar_t *iter;
1668
17.8k
    Py_UCS4 ch;
1669
1670
17.8k
    assert(num_surrogates != NULL && maxchar != NULL);
1671
17.8k
    *num_surrogates = 0;
1672
17.8k
    *maxchar = 0;
1673
1674
398k
    for (iter = begin; iter < end; ) {
1675
#if SIZEOF_WCHAR_T == 2
1676
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1677
            && (iter+1) < end
1678
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1679
        {
1680
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1681
            ++(*num_surrogates);
1682
            iter += 2;
1683
        }
1684
        else
1685
#endif
1686
380k
        {
1687
380k
            ch = *iter;
1688
380k
            iter++;
1689
380k
        }
1690
380k
        if (ch > *maxchar) {
1691
77.8k
            *maxchar = ch;
1692
77.8k
            if (*maxchar > MAX_UNICODE) {
1693
0
                PyErr_Format(PyExc_ValueError,
1694
0
                             "character U+%x is not in range [U+0000; U+%x]",
1695
0
                             ch, MAX_UNICODE);
1696
0
                return -1;
1697
0
            }
1698
77.8k
        }
1699
380k
    }
1700
17.8k
    return 0;
1701
17.8k
}
1702
1703
static void
1704
unicode_dealloc(PyObject *unicode)
1705
491M
{
1706
#ifdef Py_DEBUG
1707
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1708
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1709
    }
1710
#endif
1711
491M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1712
        /* This should never get called, but we also don't want to SEGV if
1713
        * we accidentally decref an immortal string out of existence. Since
1714
        * the string is an immortal object, just re-set the reference count.
1715
        */
1716
#ifdef Py_DEBUG
1717
        Py_UNREACHABLE();
1718
#endif
1719
0
        _Py_SetImmortal(unicode);
1720
0
        return;
1721
0
    }
1722
491M
    switch (_PyUnicode_STATE(unicode).interned) {
1723
490M
        case SSTATE_NOT_INTERNED:
1724
490M
            break;
1725
420k
        case SSTATE_INTERNED_MORTAL:
1726
            /* Remove the object from the intern dict.
1727
             * Before doing so, we set the refcount to 2: the key and value
1728
             * in the interned_dict.
1729
             */
1730
420k
            assert(Py_REFCNT(unicode) == 0);
1731
420k
            Py_SET_REFCNT(unicode, 2);
1732
#ifdef Py_REF_DEBUG
1733
            /* let's be pedantic with the ref total */
1734
            _Py_IncRefTotal(_PyThreadState_GET());
1735
            _Py_IncRefTotal(_PyThreadState_GET());
1736
#endif
1737
420k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1738
420k
            PyObject *interned = get_interned_dict(interp);
1739
420k
            assert(interned != NULL);
1740
420k
            PyObject *popped;
1741
420k
            int r = PyDict_Pop(interned, unicode, &popped);
1742
420k
            if (r == -1) {
1743
0
                PyErr_FormatUnraisable("Exception ignored while "
1744
0
                                       "removing an interned string %R",
1745
0
                                       unicode);
1746
                // We don't know what happened to the string. It's probably
1747
                // best to leak it:
1748
                // - if it was popped, there are no more references to it
1749
                //   so it can't cause trouble (except wasted memory)
1750
                // - if it wasn't popped, it'll remain interned
1751
0
                _Py_SetImmortal(unicode);
1752
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1753
0
                return;
1754
0
            }
1755
420k
            if (r == 0) {
1756
                // The interned string was not found in the interned_dict.
1757
#ifdef Py_DEBUG
1758
                Py_UNREACHABLE();
1759
#endif
1760
0
                _Py_SetImmortal(unicode);
1761
0
                return;
1762
0
            }
1763
            // Successfully popped.
1764
420k
            assert(popped == unicode);
1765
            // Only our `popped` reference should be left; remove it too.
1766
420k
            assert(Py_REFCNT(unicode) == 1);
1767
420k
            Py_SET_REFCNT(unicode, 0);
1768
#ifdef Py_REF_DEBUG
1769
            /* let's be pedantic with the ref total */
1770
            _Py_DecRefTotal(_PyThreadState_GET());
1771
#endif
1772
420k
            break;
1773
0
        default:
1774
            // As with `statically_allocated` above.
1775
#ifdef Py_REF_DEBUG
1776
            Py_UNREACHABLE();
1777
#endif
1778
0
            _Py_SetImmortal(unicode);
1779
0
            return;
1780
491M
    }
1781
491M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1782
152k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1783
152k
    }
1784
491M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1785
9.82M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1786
9.82M
    }
1787
1788
491M
    Py_TYPE(unicode)->tp_free(unicode);
1789
491M
}
1790
1791
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   one of the shared one-character strings reachable through LATIN1(),
   and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    if (_PyASCIIObject_CAST(unicode)->length != 1) {
        return 0;
    }

    /* One-character string: compare identity with the cached object. */
    Py_UCS4 only = PyUnicode_READ_CHAR(unicode, 0);
    if (only < 256 && LATIN1(only) == unicode) {
        return 1;
    }
    return 0;
}
#endif
1809
1810
static int
1811
unicode_modifiable(PyObject *unicode)
1812
58.7M
{
1813
58.7M
    assert(_PyUnicode_CHECK(unicode));
1814
58.7M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1815
45.4k
        return 0;
1816
58.7M
    if (PyUnicode_HASH(unicode) != -1)
1817
0
        return 0;
1818
58.7M
    if (PyUnicode_CHECK_INTERNED(unicode))
1819
0
        return 0;
1820
58.7M
    if (!PyUnicode_CheckExact(unicode))
1821
0
        return 0;
1822
#ifdef Py_DEBUG
1823
    /* singleton refcount is greater than 1 */
1824
    assert(!unicode_is_singleton(unicode));
1825
#endif
1826
58.7M
    return 1;
1827
58.7M
}
1828
1829
static int
1830
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1831
744k
{
1832
744k
    PyObject *unicode;
1833
744k
    Py_ssize_t old_length;
1834
1835
744k
    assert(p_unicode != NULL);
1836
744k
    unicode = *p_unicode;
1837
1838
744k
    assert(unicode != NULL);
1839
744k
    assert(PyUnicode_Check(unicode));
1840
744k
    assert(0 <= length);
1841
1842
744k
    old_length = PyUnicode_GET_LENGTH(unicode);
1843
744k
    if (old_length == length)
1844
0
        return 0;
1845
1846
744k
    if (length == 0) {
1847
0
        PyObject *empty = unicode_get_empty();
1848
0
        Py_SETREF(*p_unicode, empty);
1849
0
        return 0;
1850
0
    }
1851
1852
744k
    if (!unicode_modifiable(unicode)) {
1853
0
        PyObject *copy = resize_copy(unicode, length);
1854
0
        if (copy == NULL)
1855
0
            return -1;
1856
0
        Py_SETREF(*p_unicode, copy);
1857
0
        return 0;
1858
0
    }
1859
1860
744k
    if (PyUnicode_IS_COMPACT(unicode)) {
1861
744k
        PyObject *new_unicode = resize_compact(unicode, length);
1862
744k
        if (new_unicode == NULL)
1863
0
            return -1;
1864
744k
        *p_unicode = new_unicode;
1865
744k
        return 0;
1866
744k
    }
1867
0
    return resize_inplace(unicode, length);
1868
744k
}
1869
1870
int
1871
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1872
0
{
1873
0
    PyObject *unicode;
1874
0
    if (p_unicode == NULL) {
1875
0
        PyErr_BadInternalCall();
1876
0
        return -1;
1877
0
    }
1878
0
    unicode = *p_unicode;
1879
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1880
0
    {
1881
0
        PyErr_BadInternalCall();
1882
0
        return -1;
1883
0
    }
1884
0
    return unicode_resize(p_unicode, length);
1885
0
}
1886
1887
/* Copy an ASCII or latin1 char* string into a Python Unicode string.
1888
1889
   WARNING: The function doesn't copy the terminating null character and
1890
   doesn't check the maximum character (may write a latin1 character in an
1891
   ASCII string). */
1892
static void
1893
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1894
                   const char *str, Py_ssize_t len)
1895
0
{
1896
0
    int kind = PyUnicode_KIND(unicode);
1897
0
    const void *data = PyUnicode_DATA(unicode);
1898
0
    const char *end = str + len;
1899
1900
0
    assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1901
0
    switch (kind) {
1902
0
    case PyUnicode_1BYTE_KIND: {
1903
#ifdef Py_DEBUG
1904
        if (PyUnicode_IS_ASCII(unicode)) {
1905
            Py_UCS4 maxchar = ucs1lib_find_max_char(
1906
                (const Py_UCS1*)str,
1907
                (const Py_UCS1*)str + len);
1908
            assert(maxchar < 128);
1909
        }
1910
#endif
1911
0
        memcpy((char *) data + index, str, len);
1912
0
        break;
1913
0
    }
1914
0
    case PyUnicode_2BYTE_KIND: {
1915
0
        Py_UCS2 *start = (Py_UCS2 *)data + index;
1916
0
        Py_UCS2 *ucs2 = start;
1917
1918
0
        for (; str < end; ++ucs2, ++str)
1919
0
            *ucs2 = (Py_UCS2)*str;
1920
1921
0
        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1922
0
        break;
1923
0
    }
1924
0
    case PyUnicode_4BYTE_KIND: {
1925
0
        Py_UCS4 *start = (Py_UCS4 *)data + index;
1926
0
        Py_UCS4 *ucs4 = start;
1927
1928
0
        for (; str < end; ++ucs4, ++str)
1929
0
            *ucs4 = (Py_UCS4)*str;
1930
1931
0
        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1932
0
        break;
1933
0
    }
1934
0
    default:
1935
0
        Py_UNREACHABLE();
1936
0
    }
1937
0
}
1938
1939
static PyObject*
1940
get_latin1_char(Py_UCS1 ch)
1941
243M
{
1942
243M
    PyObject *o = LATIN1(ch);
1943
243M
    return o;
1944
243M
}
1945
1946
static PyObject*
1947
unicode_char(Py_UCS4 ch)
1948
303M
{
1949
303M
    PyObject *unicode;
1950
1951
303M
    assert(ch <= MAX_UNICODE);
1952
1953
303M
    if (ch < 256) {
1954
183M
        return get_latin1_char(ch);
1955
183M
    }
1956
1957
119M
    unicode = PyUnicode_New(1, ch);
1958
119M
    if (unicode == NULL)
1959
0
        return NULL;
1960
1961
119M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1962
119M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1963
115M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1964
115M
    } else {
1965
3.40M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1966
3.40M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1967
3.40M
    }
1968
119M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1969
119M
    return unicode;
1970
119M
}
1971
1972
1973
static inline void
1974
unicode_write_widechar(int kind, void *data,
1975
                       const wchar_t *u, Py_ssize_t size,
1976
                       Py_ssize_t num_surrogates)
1977
17.8k
{
1978
17.8k
    switch (kind) {
1979
17.8k
    case PyUnicode_1BYTE_KIND:
1980
17.8k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1981
17.8k
        break;
1982
1983
0
    case PyUnicode_2BYTE_KIND:
1984
#if SIZEOF_WCHAR_T == 2
1985
        memcpy(data, u, size * 2);
1986
#else
1987
0
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1988
0
#endif
1989
0
        break;
1990
1991
0
    case PyUnicode_4BYTE_KIND:
1992
0
    {
1993
#if SIZEOF_WCHAR_T == 2
1994
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1995
        // surrogate pairs.
1996
        const wchar_t *end = u + size;
1997
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1998
#  ifndef NDEBUG
1999
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
2000
#  endif
2001
        for (const wchar_t *iter = u; iter < end; ) {
2002
            assert(ucs4_out < ucs4_end);
2003
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
2004
                && (iter+1) < end
2005
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
2006
            {
2007
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
2008
                iter += 2;
2009
            }
2010
            else {
2011
                *ucs4_out++ = *iter;
2012
                iter++;
2013
            }
2014
        }
2015
        assert(ucs4_out == ucs4_end);
2016
#else
2017
0
        assert(num_surrogates == 0);
2018
0
        memcpy(data, u, size * 4);
2019
0
#endif
2020
0
        break;
2021
0
    }
2022
0
    default:
2023
0
        Py_UNREACHABLE();
2024
17.8k
    }
2025
17.8k
}
2026
2027
2028
PyObject *
2029
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2030
17.8k
{
2031
17.8k
    PyObject *unicode;
2032
17.8k
    Py_UCS4 maxchar = 0;
2033
17.8k
    Py_ssize_t num_surrogates;
2034
2035
17.8k
    if (u == NULL && size != 0) {
2036
0
        PyErr_BadInternalCall();
2037
0
        return NULL;
2038
0
    }
2039
2040
17.8k
    if (size == -1) {
2041
576
        size = wcslen(u);
2042
576
    }
2043
2044
    /* If the Unicode data is known at construction time, we can apply
2045
       some optimizations which share commonly used objects. */
2046
2047
    /* Optimization for empty strings */
2048
17.8k
    if (size == 0)
2049
32
        _Py_RETURN_UNICODE_EMPTY();
2050
2051
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2052
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2053
       non-Unicode locales and hence needs conversion to UCS-4 first. */
2054
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2055
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2056
        if (!converted) {
2057
            return NULL;
2058
        }
2059
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2060
        PyMem_Free(converted);
2061
        return unicode;
2062
    }
2063
#endif
2064
2065
    /* Single character Unicode objects in the Latin-1 range are
2066
       shared when using this constructor */
2067
17.8k
    if (size == 1 && (Py_UCS4)*u < 256)
2068
0
        return get_latin1_char((unsigned char)*u);
2069
2070
    /* If not empty and not single character, copy the Unicode data
2071
       into the new object */
2072
17.8k
    if (find_maxchar_surrogates(u, u + size,
2073
17.8k
                                &maxchar, &num_surrogates) == -1)
2074
0
        return NULL;
2075
2076
17.8k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
2077
17.8k
    if (!unicode)
2078
0
        return NULL;
2079
2080
17.8k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2081
17.8k
                           u, size, num_surrogates);
2082
2083
17.8k
    return unicode_result(unicode);
2084
17.8k
}
2085
2086
2087
int
2088
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
2089
                              const wchar_t *str,
2090
                              Py_ssize_t size)
2091
0
{
2092
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
2093
2094
0
    if (size < 0) {
2095
0
        size = wcslen(str);
2096
0
    }
2097
2098
0
    if (size == 0) {
2099
0
        return 0;
2100
0
    }
2101
2102
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2103
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2104
       non-Unicode locales and hence needs conversion to UCS-4 first. */
2105
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2106
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
2107
        if (!converted) {
2108
            return -1;
2109
        }
2110
2111
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
2112
        PyMem_Free(converted);
2113
        return res;
2114
    }
2115
#endif
2116
2117
0
    Py_UCS4 maxchar = 0;
2118
0
    Py_ssize_t num_surrogates;
2119
0
    if (find_maxchar_surrogates(str, str + size,
2120
0
                                &maxchar, &num_surrogates) == -1) {
2121
0
        return -1;
2122
0
    }
2123
2124
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
2125
0
        return -1;
2126
0
    }
2127
2128
0
    int kind = writer->kind;
2129
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2130
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
2131
2132
0
    writer->pos += size - num_surrogates;
2133
0
    return 0;
2134
0
}
2135
2136
2137
PyObject *
2138
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2139
576k
{
2140
576k
    if (size < 0) {
2141
0
        PyErr_SetString(PyExc_SystemError,
2142
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2143
0
        return NULL;
2144
0
    }
2145
576k
    if (u != NULL) {
2146
576k
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2147
576k
    }
2148
0
    if (size > 0) {
2149
0
        PyErr_SetString(PyExc_SystemError,
2150
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2151
0
        return NULL;
2152
0
    }
2153
0
    return unicode_get_empty();
2154
0
}
2155
2156
PyObject *
2157
PyUnicode_FromString(const char *u)
2158
6.96M
{
2159
6.96M
    size_t size = strlen(u);
2160
6.96M
    if (size > PY_SSIZE_T_MAX) {
2161
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2162
0
        return NULL;
2163
0
    }
2164
6.96M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2165
6.96M
}
2166
2167
2168
PyObject *
2169
_PyUnicode_FromId(_Py_Identifier *id)
2170
0
{
2171
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2172
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2173
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2174
2175
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2176
0
    if (index < 0) {
2177
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2178
2179
0
        PyMutex_Lock(&rt_ids->mutex);
2180
        // Check again to detect concurrent access. Another thread can have
2181
        // initialized the index while this thread waited for the lock.
2182
0
        index = _Py_atomic_load_ssize(&id->index);
2183
0
        if (index < 0) {
2184
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2185
0
            index = rt_ids->next_index;
2186
0
            rt_ids->next_index++;
2187
0
            _Py_atomic_store_ssize(&id->index, index);
2188
0
        }
2189
0
        PyMutex_Unlock(&rt_ids->mutex);
2190
0
    }
2191
0
    assert(index >= 0);
2192
2193
0
    PyObject *obj;
2194
0
    if (index < ids->size) {
2195
0
        obj = ids->array[index];
2196
0
        if (obj) {
2197
            // Return a borrowed reference
2198
0
            goto end;
2199
0
        }
2200
0
    }
2201
2202
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2203
0
                                       NULL, NULL);
2204
0
    if (!obj) {
2205
0
        goto end;
2206
0
    }
2207
0
    _PyUnicode_InternImmortal(interp, &obj);
2208
2209
0
    if (index >= ids->size) {
2210
        // Overallocate to reduce the number of realloc
2211
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2212
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2213
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2214
0
        if (new_array == NULL) {
2215
0
            PyErr_NoMemory();
2216
0
            obj = NULL;
2217
0
            goto end;
2218
0
        }
2219
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2220
0
        ids->array = new_array;
2221
0
        ids->size = new_size;
2222
0
    }
2223
2224
    // The array stores a strong reference
2225
0
    ids->array[index] = obj;
2226
2227
0
end:
2228
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2229
    // Return a borrowed reference
2230
0
    return obj;
2231
0
}
2232
2233
2234
static void
2235
unicode_clear_identifiers(struct _Py_unicode_state *state)
2236
0
{
2237
0
    struct _Py_unicode_ids *ids = &state->ids;
2238
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2239
0
        Py_XDECREF(ids->array[i]);
2240
0
    }
2241
0
    ids->size = 0;
2242
0
    PyMem_Free(ids->array);
2243
0
    ids->array = NULL;
2244
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2245
    // after Py_Finalize().
2246
0
}
2247
2248
2249
/* Internal function, doesn't check maximum character */
2250
2251
PyObject*
2252
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2253
88.6M
{
2254
88.6M
    const unsigned char *s = (const unsigned char *)buffer;
2255
88.6M
    PyObject *unicode;
2256
88.6M
    if (size == 1) {
2257
#ifdef Py_DEBUG
2258
        assert((unsigned char)s[0] < 128);
2259
#endif
2260
36.5M
        return get_latin1_char(s[0]);
2261
36.5M
    }
2262
52.0M
    unicode = PyUnicode_New(size, 127);
2263
52.0M
    if (!unicode)
2264
0
        return NULL;
2265
52.0M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2266
52.0M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2267
52.0M
    return unicode;
2268
52.0M
}
2269
2270
static Py_UCS4
2271
kind_maxchar_limit(int kind)
2272
0
{
2273
0
    switch (kind) {
2274
0
    case PyUnicode_1BYTE_KIND:
2275
0
        return 0x80;
2276
0
    case PyUnicode_2BYTE_KIND:
2277
0
        return 0x100;
2278
0
    case PyUnicode_4BYTE_KIND:
2279
0
        return 0x10000;
2280
0
    default:
2281
0
        Py_UNREACHABLE();
2282
0
    }
2283
0
}
2284
2285
static PyObject*
2286
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2287
62.8M
{
2288
62.8M
    PyObject *res;
2289
62.8M
    unsigned char max_char;
2290
2291
62.8M
    if (size == 0) {
2292
4.55M
        _Py_RETURN_UNICODE_EMPTY();
2293
4.55M
    }
2294
58.2M
    assert(size > 0);
2295
58.2M
    if (size == 1) {
2296
21.1M
        return get_latin1_char(u[0]);
2297
21.1M
    }
2298
2299
37.1M
    max_char = ucs1lib_find_max_char(u, u + size);
2300
37.1M
    res = PyUnicode_New(size, max_char);
2301
37.1M
    if (!res)
2302
0
        return NULL;
2303
37.1M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2304
37.1M
    assert(_PyUnicode_CheckConsistency(res, 1));
2305
37.1M
    return res;
2306
37.1M
}
2307
2308
static PyObject*
2309
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2310
105M
{
2311
105M
    PyObject *res;
2312
105M
    Py_UCS2 max_char;
2313
2314
105M
    if (size == 0)
2315
9.73M
        _Py_RETURN_UNICODE_EMPTY();
2316
95.5M
    assert(size > 0);
2317
95.5M
    if (size == 1)
2318
68.2M
        return unicode_char(u[0]);
2319
2320
27.3M
    max_char = ucs2lib_find_max_char(u, u + size);
2321
27.3M
    res = PyUnicode_New(size, max_char);
2322
27.3M
    if (!res)
2323
0
        return NULL;
2324
27.3M
    if (max_char >= 256)
2325
15.7M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2326
11.6M
    else {
2327
11.6M
        _PyUnicode_CONVERT_BYTES(
2328
11.6M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2329
11.6M
    }
2330
27.3M
    assert(_PyUnicode_CheckConsistency(res, 1));
2331
27.3M
    return res;
2332
27.3M
}
2333
2334
static PyObject*
2335
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2336
84.3M
{
2337
84.3M
    PyObject *res;
2338
84.3M
    Py_UCS4 max_char;
2339
2340
84.3M
    if (size == 0)
2341
6.81M
        _Py_RETURN_UNICODE_EMPTY();
2342
77.5M
    assert(size > 0);
2343
77.5M
    if (size == 1)
2344
58.4M
        return unicode_char(u[0]);
2345
2346
19.1M
    max_char = ucs4lib_find_max_char(u, u + size);
2347
19.1M
    res = PyUnicode_New(size, max_char);
2348
19.1M
    if (!res)
2349
0
        return NULL;
2350
19.1M
    if (max_char < 256)
2351
13.9M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2352
19.1M
                                 PyUnicode_1BYTE_DATA(res));
2353
5.17M
    else if (max_char < 0x10000)
2354
3.46M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2355
5.17M
                                 PyUnicode_2BYTE_DATA(res));
2356
1.71M
    else
2357
1.71M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2358
19.1M
    assert(_PyUnicode_CheckConsistency(res, 1));
2359
19.1M
    return res;
2360
19.1M
}
2361
2362
2363
int
2364
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2365
                          Py_UCS4 *str,
2366
                          Py_ssize_t size)
2367
0
{
2368
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2369
2370
0
    if (size < 0) {
2371
0
        PyErr_SetString(PyExc_ValueError,
2372
0
                        "size must be positive");
2373
0
        return -1;
2374
0
    }
2375
2376
0
    if (size == 0) {
2377
0
        return 0;
2378
0
    }
2379
2380
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2381
2382
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2383
0
        return -1;
2384
0
    }
2385
2386
0
    int kind = writer->kind;
2387
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2388
0
    if (kind == PyUnicode_1BYTE_KIND) {
2389
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2390
0
                                 str, str + size,
2391
0
                                 data);
2392
0
    }
2393
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2394
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2395
0
                                 str, str + size,
2396
0
                                 data);
2397
0
    }
2398
0
    else {
2399
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2400
0
    }
2401
0
    writer->pos += size;
2402
2403
0
    return 0;
2404
0
}
2405
2406
2407
PyObject*
2408
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2409
209M
{
2410
209M
    if (size < 0) {
2411
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2412
0
        return NULL;
2413
0
    }
2414
209M
    switch (kind) {
2415
43.1M
    case PyUnicode_1BYTE_KIND:
2416
43.1M
        return _PyUnicode_FromUCS1(buffer, size);
2417
92.6M
    case PyUnicode_2BYTE_KIND:
2418
92.6M
        return _PyUnicode_FromUCS2(buffer, size);
2419
73.7M
    case PyUnicode_4BYTE_KIND:
2420
73.7M
        return _PyUnicode_FromUCS4(buffer, size);
2421
0
    default:
2422
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2423
0
        return NULL;
2424
209M
    }
2425
209M
}
2426
2427
Py_UCS4
2428
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2429
14.0M
{
2430
14.0M
    int kind;
2431
14.0M
    const void *startptr, *endptr;
2432
2433
14.0M
    assert(0 <= start);
2434
14.0M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2435
14.0M
    assert(start <= end);
2436
2437
14.0M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2438
0
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2439
2440
14.0M
    if (start == end)
2441
0
        return 127;
2442
2443
14.0M
    if (PyUnicode_IS_ASCII(unicode))
2444
13.9M
        return 127;
2445
2446
28.7k
    kind = PyUnicode_KIND(unicode);
2447
28.7k
    startptr = PyUnicode_DATA(unicode);
2448
28.7k
    endptr = (char *)startptr + end * kind;
2449
28.7k
    startptr = (char *)startptr + start * kind;
2450
28.7k
    switch(kind) {
2451
1.61k
    case PyUnicode_1BYTE_KIND:
2452
1.61k
        return ucs1lib_find_max_char(startptr, endptr);
2453
5.01k
    case PyUnicode_2BYTE_KIND:
2454
5.01k
        return ucs2lib_find_max_char(startptr, endptr);
2455
22.0k
    case PyUnicode_4BYTE_KIND:
2456
22.0k
        return ucs4lib_find_max_char(startptr, endptr);
2457
0
    default:
2458
0
        Py_UNREACHABLE();
2459
28.7k
    }
2460
28.7k
}
2461
2462
/* Ensure that a string uses the most efficient storage, if it is not the
2463
   case: create a new string with of the right kind. Write NULL into *p_unicode
2464
   on error. */
2465
static void
2466
unicode_adjust_maxchar(PyObject **p_unicode)
2467
0
{
2468
0
    PyObject *unicode, *copy;
2469
0
    Py_UCS4 max_char;
2470
0
    Py_ssize_t len;
2471
0
    int kind;
2472
2473
0
    assert(p_unicode != NULL);
2474
0
    unicode = *p_unicode;
2475
0
    if (PyUnicode_IS_ASCII(unicode))
2476
0
        return;
2477
2478
0
    len = PyUnicode_GET_LENGTH(unicode);
2479
0
    kind = PyUnicode_KIND(unicode);
2480
0
    if (kind == PyUnicode_1BYTE_KIND) {
2481
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2482
0
        max_char = ucs1lib_find_max_char(u, u + len);
2483
0
        if (max_char >= 128)
2484
0
            return;
2485
0
    }
2486
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2487
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2488
0
        max_char = ucs2lib_find_max_char(u, u + len);
2489
0
        if (max_char >= 256)
2490
0
            return;
2491
0
    }
2492
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2493
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2494
0
        max_char = ucs4lib_find_max_char(u, u + len);
2495
0
        if (max_char >= 0x10000)
2496
0
            return;
2497
0
    }
2498
0
    else
2499
0
        Py_UNREACHABLE();
2500
2501
0
    copy = PyUnicode_New(len, max_char);
2502
0
    if (copy != NULL)
2503
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2504
0
    Py_DECREF(unicode);
2505
0
    *p_unicode = copy;
2506
0
}
2507
2508
/* Return a new string with the same length, maximum character value and
   character data as `unicode`.

   Raise via PyErr_BadInternalCall() and return NULL if `unicode` is not a
   str; return NULL if the allocation fails. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *dup = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(unicode));
    if (dup == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(dup) == PyUnicode_KIND(unicode));

    /* Same kind on both sides, so a raw byte copy is enough. */
    memcpy(PyUnicode_DATA(dup), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(dup, 1));
    return dup;
}
2530
2531
2532
/* Widen Unicode objects to larger buffers. Don't write terminating null
2533
   character. Return NULL on error. */
2534
2535
static void*
2536
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2537
11.3M
{
2538
11.3M
    void *result;
2539
2540
11.3M
    assert(skind < kind);
2541
11.3M
    switch (kind) {
2542
10.4M
    case PyUnicode_2BYTE_KIND:
2543
10.4M
        result = PyMem_New(Py_UCS2, len);
2544
10.4M
        if (!result)
2545
0
            return PyErr_NoMemory();
2546
10.4M
        assert(skind == PyUnicode_1BYTE_KIND);
2547
10.4M
        _PyUnicode_CONVERT_BYTES(
2548
10.4M
            Py_UCS1, Py_UCS2,
2549
10.4M
            (const Py_UCS1 *)data,
2550
10.4M
            ((const Py_UCS1 *)data) + len,
2551
10.4M
            result);
2552
10.4M
        return result;
2553
949k
    case PyUnicode_4BYTE_KIND:
2554
949k
        result = PyMem_New(Py_UCS4, len);
2555
949k
        if (!result)
2556
0
            return PyErr_NoMemory();
2557
949k
        if (skind == PyUnicode_2BYTE_KIND) {
2558
0
            _PyUnicode_CONVERT_BYTES(
2559
0
                Py_UCS2, Py_UCS4,
2560
0
                (const Py_UCS2 *)data,
2561
0
                ((const Py_UCS2 *)data) + len,
2562
0
                result);
2563
0
        }
2564
949k
        else {
2565
949k
            assert(skind == PyUnicode_1BYTE_KIND);
2566
949k
            _PyUnicode_CONVERT_BYTES(
2567
949k
                Py_UCS1, Py_UCS4,
2568
949k
                (const Py_UCS1 *)data,
2569
949k
                ((const Py_UCS1 *)data) + len,
2570
949k
                result);
2571
949k
        }
2572
949k
        return result;
2573
0
    default:
2574
0
        Py_UNREACHABLE();
2575
0
        return NULL;
2576
11.3M
    }
2577
11.3M
}
2578
2579
static Py_UCS4*
2580
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2581
        int copy_null)
2582
73.3k
{
2583
73.3k
    int kind;
2584
73.3k
    const void *data;
2585
73.3k
    Py_ssize_t len, targetlen;
2586
73.3k
    kind = PyUnicode_KIND(string);
2587
73.3k
    data = PyUnicode_DATA(string);
2588
73.3k
    len = PyUnicode_GET_LENGTH(string);
2589
73.3k
    targetlen = len;
2590
73.3k
    if (copy_null)
2591
0
        targetlen++;
2592
73.3k
    if (!target) {
2593
0
        target = PyMem_New(Py_UCS4, targetlen);
2594
0
        if (!target) {
2595
0
            PyErr_NoMemory();
2596
0
            return NULL;
2597
0
        }
2598
0
    }
2599
73.3k
    else {
2600
73.3k
        if (targetsize < targetlen) {
2601
0
            PyErr_Format(PyExc_SystemError,
2602
0
                         "string is longer than the buffer");
2603
0
            if (copy_null && 0 < targetsize)
2604
0
                target[0] = 0;
2605
0
            return NULL;
2606
0
        }
2607
73.3k
    }
2608
73.3k
    if (kind == PyUnicode_1BYTE_KIND) {
2609
55.2k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2610
55.2k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2611
55.2k
    }
2612
18.1k
    else if (kind == PyUnicode_2BYTE_KIND) {
2613
13.8k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2614
13.8k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2615
13.8k
    }
2616
4.24k
    else if (kind == PyUnicode_4BYTE_KIND) {
2617
4.24k
        memcpy(target, data, len * sizeof(Py_UCS4));
2618
4.24k
    }
2619
0
    else {
2620
0
        Py_UNREACHABLE();
2621
0
    }
2622
73.3k
    if (copy_null)
2623
0
        target[len] = 0;
2624
73.3k
    return target;
2625
73.3k
}
2626
2627
Py_UCS4*
2628
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2629
                 int copy_null)
2630
73.3k
{
2631
73.3k
    if (target == NULL || targetsize < 0) {
2632
0
        PyErr_BadInternalCall();
2633
0
        return NULL;
2634
0
    }
2635
73.3k
    return as_ucs4(string, target, targetsize, copy_null);
2636
73.3k
}
2637
2638
Py_UCS4*
2639
PyUnicode_AsUCS4Copy(PyObject *string)
2640
0
{
2641
0
    return as_ucs4(string, NULL, 0, 1);
2642
0
}
2643
2644
/* Upper bound on the number of characters produced when formatting with
   %jo, %jd or %p: at most ceil(log8(256)*sizeof(intmax_t)) digits, plus 1
   for the sign, plus 2 for the 0x prefix (for %p), plus 1 for the
   terminating NUL. */
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t) * 8 - 1) / 3)
2649
2650
static int
2651
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2652
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2653
15.1k
{
2654
15.1k
    Py_ssize_t length, fill, arglen;
2655
15.1k
    Py_UCS4 maxchar;
2656
2657
15.1k
    length = PyUnicode_GET_LENGTH(str);
2658
15.1k
    if ((precision == -1 || precision >= length)
2659
15.1k
        && width <= length)
2660
15.0k
        return _PyUnicodeWriter_WriteStr(writer, str);
2661
2662
50
    if (precision != -1)
2663
50
        length = Py_MIN(precision, length);
2664
2665
50
    arglen = Py_MAX(length, width);
2666
50
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2667
26
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2668
24
    else
2669
24
        maxchar = writer->maxchar;
2670
2671
50
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2672
0
        return -1;
2673
2674
50
    fill = Py_MAX(width - length, 0);
2675
50
    if (fill && !(flags & F_LJUST)) {
2676
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2677
0
            return -1;
2678
0
        writer->pos += fill;
2679
0
    }
2680
2681
50
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2682
50
                                  str, 0, length);
2683
50
    writer->pos += length;
2684
2685
50
    if (fill && (flags & F_LJUST)) {
2686
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2687
0
            return -1;
2688
0
        writer->pos += fill;
2689
0
    }
2690
2691
50
    return 0;
2692
50
}
2693
2694
static int
2695
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2696
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2697
4.55M
{
2698
    /* UTF-8 */
2699
4.55M
    Py_ssize_t *pconsumed = NULL;
2700
4.55M
    Py_ssize_t length;
2701
4.55M
    if (precision == -1) {
2702
206k
        length = strlen(str);
2703
206k
    }
2704
4.34M
    else {
2705
4.34M
        length = 0;
2706
18.0M
        while (length < precision && str[length]) {
2707
13.6M
            length++;
2708
13.6M
        }
2709
4.34M
        if (length == precision) {
2710
            /* The input string is not NUL-terminated.  If it ends with an
2711
             * incomplete UTF-8 sequence, truncate the string just before it.
2712
             * Incomplete sequences in the middle and sequences which cannot
2713
             * be valid prefixes are still treated as errors and replaced
2714
             * with \xfffd. */
2715
2.38k
            pconsumed = &length;
2716
2.38k
        }
2717
4.34M
    }
2718
2719
4.55M
    if (width < 0) {
2720
4.55M
        return unicode_decode_utf8_writer(writer, str, length,
2721
4.55M
                                          _Py_ERROR_REPLACE, "replace", pconsumed);
2722
4.55M
    }
2723
2724
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2725
0
                                                     "replace", pconsumed);
2726
0
    if (unicode == NULL)
2727
0
        return -1;
2728
2729
0
    int res = unicode_fromformat_write_str(writer, unicode,
2730
0
                                           width, -1, flags);
2731
0
    Py_DECREF(unicode);
2732
0
    return res;
2733
0
}
2734
2735
static int
2736
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2737
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2738
0
{
2739
0
    Py_ssize_t length;
2740
0
    if (precision == -1) {
2741
0
        length = wcslen(str);
2742
0
    }
2743
0
    else {
2744
0
        length = 0;
2745
0
        while (length < precision && str[length]) {
2746
0
            length++;
2747
0
        }
2748
0
    }
2749
2750
0
    if (width < 0) {
2751
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2752
0
                                             str, length);
2753
0
    }
2754
2755
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2756
0
    if (unicode == NULL)
2757
0
        return -1;
2758
2759
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2760
0
    Py_DECREF(unicode);
2761
0
    return res;
2762
0
}
2763
2764
0
/* Size modifiers recognised by unicode_fromformat_arg().  The parsed value
   is kept in its local `sizemod` variable (0 when no modifier is present),
   separately from the F_LJUST / F_ZERO / F_ALT bits stored in `flags`. */
#define F_LONG 1        /* "l"  */
#define F_LONGLONG 2    /* "ll" */
#define F_SIZE 3        /* "z"  */
#define F_PTRDIFF 4     /* "t"  */
#define F_INTMAX 5      /* "j"  */
2769
2770
static const char*
2771
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2772
                       const char *f, va_list *vargs)
2773
33.0M
{
2774
33.0M
    const char *p;
2775
33.0M
    Py_ssize_t len;
2776
33.0M
    int flags = 0;
2777
33.0M
    Py_ssize_t width;
2778
33.0M
    Py_ssize_t precision;
2779
2780
33.0M
    p = f;
2781
33.0M
    f++;
2782
33.0M
    if (*f == '%') {
2783
4.33M
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2784
0
            return NULL;
2785
4.33M
        f++;
2786
4.33M
        return f;
2787
4.33M
    }
2788
2789
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2790
    /* Flags '+', ' ' and '#' are not particularly useful.
2791
     * They are not worth the implementation and maintenance costs.
2792
     * In addition, '#' should add "0" for "o" conversions for compatibility
2793
     * with printf, but it would confuse Python users. */
2794
28.6M
    while (1) {
2795
28.6M
        switch (*f++) {
2796
0
        case '-': flags |= F_LJUST; continue;
2797
2.69k
        case '0': flags |= F_ZERO; continue;
2798
0
        case '#': flags |= F_ALT; continue;
2799
28.6M
        }
2800
28.6M
        f--;
2801
28.6M
        break;
2802
28.6M
    }
2803
2804
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2805
28.6M
    width = -1;
2806
28.6M
    if (*f == '*') {
2807
0
        width = va_arg(*vargs, int);
2808
0
        if (width < 0) {
2809
0
            flags |= F_LJUST;
2810
0
            width = -width;
2811
0
        }
2812
0
        f++;
2813
0
    }
2814
28.6M
    else if (Py_ISDIGIT((unsigned)*f)) {
2815
2.69k
        width = *f - '0';
2816
2.69k
        f++;
2817
2.69k
        while (Py_ISDIGIT((unsigned)*f)) {
2818
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2819
0
                PyErr_SetString(PyExc_ValueError,
2820
0
                                "width too big");
2821
0
                return NULL;
2822
0
            }
2823
0
            width = (width * 10) + (*f - '0');
2824
0
            f++;
2825
0
        }
2826
2.69k
    }
2827
28.6M
    precision = -1;
2828
28.6M
    if (*f == '.') {
2829
4.34M
        f++;
2830
4.34M
        if (*f == '*') {
2831
0
            precision = va_arg(*vargs, int);
2832
0
            if (precision < 0) {
2833
0
                precision = -2;
2834
0
            }
2835
0
            f++;
2836
0
        }
2837
4.34M
        else if (Py_ISDIGIT((unsigned)*f)) {
2838
4.34M
            precision = (*f - '0');
2839
4.34M
            f++;
2840
13.0M
            while (Py_ISDIGIT((unsigned)*f)) {
2841
8.68M
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2842
0
                    PyErr_SetString(PyExc_ValueError,
2843
0
                                    "precision too big");
2844
0
                    return NULL;
2845
0
                }
2846
8.68M
                precision = (precision * 10) + (*f - '0');
2847
8.68M
                f++;
2848
8.68M
            }
2849
4.34M
        }
2850
4.34M
    }
2851
2852
28.6M
    int sizemod = 0;
2853
28.6M
    if (*f == 'l') {
2854
0
        if (f[1] == 'l') {
2855
0
            sizemod = F_LONGLONG;
2856
0
            f += 2;
2857
0
        }
2858
0
        else {
2859
0
            sizemod = F_LONG;
2860
0
            ++f;
2861
0
        }
2862
0
    }
2863
28.6M
    else if (*f == 'z') {
2864
37.2k
        sizemod = F_SIZE;
2865
37.2k
        ++f;
2866
37.2k
    }
2867
28.6M
    else if (*f == 't') {
2868
0
        sizemod = F_PTRDIFF;
2869
0
        ++f;
2870
0
    }
2871
28.6M
    else if (*f == 'j') {
2872
0
        sizemod = F_INTMAX;
2873
0
        ++f;
2874
0
    }
2875
28.6M
    if (f[0] != '\0' && f[1] == '\0')
2876
4.43M
        writer->overallocate = 0;
2877
2878
28.6M
    switch (*f) {
2879
19.7M
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2880
19.7M
        break;
2881
4.34M
    case 'c': case 'p':
2882
4.34M
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2883
4.34M
        break;
2884
4.55M
    case 's':
2885
4.55M
    case 'V':
2886
4.55M
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2887
4.55M
        break;
2888
4.55M
    default:
2889
15.1k
        if (sizemod) goto invalid_format;
2890
15.1k
        break;
2891
28.6M
    }
2892
2893
28.6M
    switch (*f) {
2894
4.34M
    case 'c':
2895
4.34M
    {
2896
4.34M
        int ordinal = va_arg(*vargs, int);
2897
4.34M
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2898
0
            PyErr_SetString(PyExc_OverflowError,
2899
0
                            "character argument not in range(0x110000)");
2900
0
            return NULL;
2901
0
        }
2902
4.34M
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2903
0
            return NULL;
2904
4.34M
        break;
2905
4.34M
    }
2906
2907
19.7M
    case 'd': case 'i':
2908
19.7M
    case 'o': case 'u': case 'x': case 'X':
2909
19.7M
    {
2910
19.7M
        char buffer[MAX_INTMAX_CHARS];
2911
2912
        // Fill buffer using sprinf, with one of many possible format
2913
        // strings, like "%llX" for `long long` in hexadecimal.
2914
        // The type/size is in `sizemod`; the format is in `*f`.
2915
2916
        // Use macros with nested switches to keep the sprintf format strings
2917
        // as compile-time literals, avoiding warnings and maybe allowing
2918
        // optimizations.
2919
2920
        // `SPRINT` macro does one sprintf
2921
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2922
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2923
19.7M
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2924
19.7M
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2925
2926
        // One inner switch to handle all format variants
2927
19.7M
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2928
19.7M
            switch (*f) {                                                     \
2929
0
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2930
0
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2931
1.52k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2932
1.17k
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2933
19.7M
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2934
19.7M
            }
2935
2936
        // Outer switch to handle all the sizes/types
2937
19.7M
        switch (sizemod) {
2938
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2939
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2940
37.2k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2941
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2942
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2943
19.7M
            default:         DO_SPRINTS("", int, unsigned int); break;
2944
19.7M
        }
2945
19.7M
        #undef SPRINT
2946
19.7M
        #undef DO_SPRINTS
2947
2948
19.7M
        assert(len >= 0);
2949
2950
19.7M
        int sign = (buffer[0] == '-');
2951
19.7M
        len -= sign;
2952
2953
19.7M
        precision = Py_MAX(precision, len);
2954
19.7M
        width = Py_MAX(width, precision + sign);
2955
19.7M
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2956
2.69k
            precision = width - sign;
2957
2.69k
        }
2958
2959
19.7M
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2960
19.7M
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2961
2962
19.7M
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2963
0
            return NULL;
2964
2965
19.7M
        if (spacepad && !(flags & F_LJUST)) {
2966
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2967
0
                return NULL;
2968
0
            writer->pos += spacepad;
2969
0
        }
2970
2971
19.7M
        if (sign) {
2972
0
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2973
0
                return NULL;
2974
0
        }
2975
2976
19.7M
        if (zeropad) {
2977
738
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2978
0
                return NULL;
2979
738
            writer->pos += zeropad;
2980
738
        }
2981
2982
19.7M
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2983
0
            return NULL;
2984
2985
19.7M
        if (spacepad && (flags & F_LJUST)) {
2986
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2987
0
                return NULL;
2988
0
            writer->pos += spacepad;
2989
0
        }
2990
19.7M
        break;
2991
19.7M
    }
2992
2993
19.7M
    case 'p':
2994
0
    {
2995
0
        char number[MAX_INTMAX_CHARS];
2996
2997
0
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2998
0
        assert(len >= 0);
2999
3000
        /* %p is ill-defined:  ensure leading 0x. */
3001
0
        if (number[1] == 'X')
3002
0
            number[1] = 'x';
3003
0
        else if (number[1] != 'x') {
3004
0
            memmove(number + 2, number,
3005
0
                    strlen(number) + 1);
3006
0
            number[0] = '0';
3007
0
            number[1] = 'x';
3008
0
            len += 2;
3009
0
        }
3010
3011
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
3012
0
            return NULL;
3013
0
        break;
3014
0
    }
3015
3016
4.55M
    case 's':
3017
4.55M
    {
3018
4.55M
        if (sizemod) {
3019
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
3020
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
3021
0
                return NULL;
3022
0
        }
3023
4.55M
        else {
3024
            /* UTF-8 */
3025
4.55M
            const char *s = va_arg(*vargs, const char*);
3026
4.55M
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
3027
0
                return NULL;
3028
4.55M
        }
3029
4.55M
        break;
3030
4.55M
    }
3031
3032
4.55M
    case 'U':
3033
14.4k
    {
3034
14.4k
        PyObject *obj = va_arg(*vargs, PyObject *);
3035
14.4k
        assert(obj && _PyUnicode_CHECK(obj));
3036
3037
14.4k
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
3038
0
            return NULL;
3039
14.4k
        break;
3040
14.4k
    }
3041
3042
14.4k
    case 'V':
3043
0
    {
3044
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3045
0
        const char *str;
3046
0
        const wchar_t *wstr;
3047
0
        if (sizemod) {
3048
0
            wstr = va_arg(*vargs, const wchar_t*);
3049
0
        }
3050
0
        else {
3051
0
            str = va_arg(*vargs, const char *);
3052
0
        }
3053
0
        if (obj) {
3054
0
            assert(_PyUnicode_CHECK(obj));
3055
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
3056
0
                return NULL;
3057
0
        }
3058
0
        else if (sizemod) {
3059
0
            assert(wstr != NULL);
3060
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
3061
0
                return NULL;
3062
0
        }
3063
0
        else {
3064
0
            assert(str != NULL);
3065
0
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
3066
0
                return NULL;
3067
0
        }
3068
0
        break;
3069
0
    }
3070
3071
22
    case 'S':
3072
22
    {
3073
22
        PyObject *obj = va_arg(*vargs, PyObject *);
3074
22
        PyObject *str;
3075
22
        assert(obj);
3076
22
        str = PyObject_Str(obj);
3077
22
        if (!str)
3078
0
            return NULL;
3079
22
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
3080
0
            Py_DECREF(str);
3081
0
            return NULL;
3082
0
        }
3083
22
        Py_DECREF(str);
3084
22
        break;
3085
22
    }
3086
3087
629
    case 'R':
3088
629
    {
3089
629
        PyObject *obj = va_arg(*vargs, PyObject *);
3090
629
        PyObject *repr;
3091
629
        assert(obj);
3092
629
        repr = PyObject_Repr(obj);
3093
629
        if (!repr)
3094
0
            return NULL;
3095
629
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
3096
0
            Py_DECREF(repr);
3097
0
            return NULL;
3098
0
        }
3099
629
        Py_DECREF(repr);
3100
629
        break;
3101
629
    }
3102
3103
0
    case 'A':
3104
0
    {
3105
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3106
0
        PyObject *ascii;
3107
0
        assert(obj);
3108
0
        ascii = PyObject_ASCII(obj);
3109
0
        if (!ascii)
3110
0
            return NULL;
3111
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
3112
0
            Py_DECREF(ascii);
3113
0
            return NULL;
3114
0
        }
3115
0
        Py_DECREF(ascii);
3116
0
        break;
3117
0
    }
3118
3119
0
    case 'T':
3120
0
    {
3121
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3122
0
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
3123
3124
0
        PyObject *type_name;
3125
0
        if (flags & F_ALT) {
3126
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3127
0
        }
3128
0
        else {
3129
0
            type_name = PyType_GetFullyQualifiedName(type);
3130
0
        }
3131
0
        Py_DECREF(type);
3132
0
        if (!type_name) {
3133
0
            return NULL;
3134
0
        }
3135
3136
0
        if (unicode_fromformat_write_str(writer, type_name,
3137
0
                                         width, precision, flags) == -1) {
3138
0
            Py_DECREF(type_name);
3139
0
            return NULL;
3140
0
        }
3141
0
        Py_DECREF(type_name);
3142
0
        break;
3143
0
    }
3144
3145
0
    case 'N':
3146
0
    {
3147
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3148
0
        assert(type_raw != NULL);
3149
3150
0
        if (!PyType_Check(type_raw)) {
3151
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3152
0
            return NULL;
3153
0
        }
3154
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3155
3156
0
        PyObject *type_name;
3157
0
        if (flags & F_ALT) {
3158
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3159
0
        }
3160
0
        else {
3161
0
            type_name = PyType_GetFullyQualifiedName(type);
3162
0
        }
3163
0
        if (!type_name) {
3164
0
            return NULL;
3165
0
        }
3166
0
        if (unicode_fromformat_write_str(writer, type_name,
3167
0
                                         width, precision, flags) == -1) {
3168
0
            Py_DECREF(type_name);
3169
0
            return NULL;
3170
0
        }
3171
0
        Py_DECREF(type_name);
3172
0
        break;
3173
0
    }
3174
3175
0
    default:
3176
0
    invalid_format:
3177
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3178
0
        return NULL;
3179
28.6M
    }
3180
3181
28.6M
    f++;
3182
28.6M
    return f;
3183
28.6M
}
3184
3185
static int
3186
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3187
14.4M
{
3188
14.4M
    Py_ssize_t len = strlen(format);
3189
14.4M
    writer->min_length += len + 100;
3190
14.4M
    writer->overallocate = 1;
3191
3192
    // Copy varags to be able to pass a reference to a subfunction.
3193
14.4M
    va_list vargs2;
3194
14.4M
    va_copy(vargs2, vargs);
3195
3196
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3197
    // to be encoded to ASCII.
3198
14.4M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3199
14.4M
    if (!is_ascii) {
3200
0
        Py_ssize_t i;
3201
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3202
0
        PyErr_Format(PyExc_ValueError,
3203
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3204
0
            "string, got a non-ASCII byte: 0x%02x",
3205
0
            (unsigned char)format[i]);
3206
0
        goto fail;
3207
0
    }
3208
3209
81.6M
    for (const char *f = format; *f; ) {
3210
67.2M
        if (*f == '%') {
3211
33.0M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3212
33.0M
            if (f == NULL)
3213
0
                goto fail;
3214
33.0M
        }
3215
34.2M
        else {
3216
34.2M
            const char *p = strchr(f, '%');
3217
34.2M
            if (p != NULL) {
3218
24.2M
                len = p - f;
3219
24.2M
            }
3220
9.96M
            else {
3221
9.96M
                len = strlen(f);
3222
9.96M
                writer->overallocate = 0;
3223
9.96M
            }
3224
3225
34.2M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3226
0
                goto fail;
3227
0
            }
3228
34.2M
            f += len;
3229
34.2M
        }
3230
67.2M
    }
3231
14.4M
    va_end(vargs2);
3232
14.4M
    return 0;
3233
3234
0
  fail:
3235
0
    va_end(vargs2);
3236
0
    return -1;
3237
14.4M
}
3238
3239
PyObject *
3240
PyUnicode_FromFormatV(const char *format, va_list vargs)
3241
14.4M
{
3242
14.4M
    _PyUnicodeWriter writer;
3243
14.4M
    _PyUnicodeWriter_Init(&writer);
3244
3245
14.4M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3246
0
        _PyUnicodeWriter_Dealloc(&writer);
3247
0
        return NULL;
3248
0
    }
3249
14.4M
    return _PyUnicodeWriter_Finish(&writer);
3250
14.4M
}
3251
3252
/* Variadic front end of PyUnicode_FromFormatV(): collect the arguments that
   follow `format` into a va_list and return what PyUnicode_FromFormatV()
   returns for them. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list vargs;

    va_start(vargs, format);
    PyObject *result = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);

    return result;
}
3263
3264
int
3265
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3266
0
{
3267
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3268
0
    Py_ssize_t old_pos = _writer->pos;
3269
3270
0
    va_list vargs;
3271
0
    va_start(vargs, format);
3272
0
    int res = unicode_from_format(_writer, format, vargs);
3273
0
    va_end(vargs);
3274
3275
0
    if (res < 0) {
3276
0
        _writer->pos = old_pos;
3277
0
    }
3278
0
    return res;
3279
0
}
3280
3281
static Py_ssize_t
3282
unicode_get_widechar_size(PyObject *unicode)
3283
1.72k
{
3284
1.72k
    Py_ssize_t res;
3285
3286
1.72k
    assert(unicode != NULL);
3287
1.72k
    assert(_PyUnicode_CHECK(unicode));
3288
3289
1.72k
    res = _PyUnicode_LENGTH(unicode);
3290
#if SIZEOF_WCHAR_T == 2
3291
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3292
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3293
        const Py_UCS4 *end = s + res;
3294
        for (; s < end; ++s) {
3295
            if (*s > 0xFFFF) {
3296
                ++res;
3297
            }
3298
        }
3299
    }
3300
#endif
3301
1.72k
    return res;
3302
1.72k
}
3303
3304
static void
3305
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3306
1.72k
{
3307
1.72k
    assert(unicode != NULL);
3308
1.72k
    assert(_PyUnicode_CHECK(unicode));
3309
3310
1.72k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3311
0
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3312
0
        return;
3313
0
    }
3314
3315
1.72k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3316
1.72k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3317
109k
        for (; size--; ++s, ++w) {
3318
107k
            *w = *s;
3319
107k
        }
3320
1.72k
    }
3321
0
    else {
3322
0
#if SIZEOF_WCHAR_T == 4
3323
0
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3324
0
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3325
0
        for (; size--; ++s, ++w) {
3326
0
            *w = *s;
3327
0
        }
3328
#else
3329
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3330
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3331
        for (; size--; ++s, ++w) {
3332
            Py_UCS4 ch = *s;
3333
            if (ch > 0xFFFF) {
3334
                assert(ch <= MAX_UNICODE);
3335
                /* encode surrogate pair in this case */
3336
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3337
                if (!size--)
3338
                    break;
3339
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3340
            }
3341
            else {
3342
                *w = ch;
3343
            }
3344
        }
3345
#endif
3346
0
    }
3347
1.72k
}
3348
3349
#ifdef HAVE_WCHAR_H
3350
3351
/* Convert a Unicode object to a wide character string.
3352
3353
   - If w is NULL: return the number of wide characters (including the null
3354
     character) required to convert the unicode object. Ignore size argument.
3355
3356
   - Otherwise: return the number of wide characters (excluding the null
3357
     character) written into w. Write at most size wide characters (including
3358
     the null character). */
3359
Py_ssize_t
3360
PyUnicode_AsWideChar(PyObject *unicode,
3361
                     wchar_t *w,
3362
                     Py_ssize_t size)
3363
448
{
3364
448
    Py_ssize_t res;
3365
3366
448
    if (unicode == NULL) {
3367
0
        PyErr_BadInternalCall();
3368
0
        return -1;
3369
0
    }
3370
448
    if (!PyUnicode_Check(unicode)) {
3371
0
        PyErr_BadArgument();
3372
0
        return -1;
3373
0
    }
3374
3375
448
    res = unicode_get_widechar_size(unicode);
3376
448
    if (w == NULL) {
3377
0
        return res + 1;
3378
0
    }
3379
3380
448
    if (size > res) {
3381
448
        size = res + 1;
3382
448
    }
3383
0
    else {
3384
0
        res = size;
3385
0
    }
3386
448
    unicode_copy_as_widechar(unicode, w, size);
3387
3388
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3389
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3390
       non-Unicode locales and hence needs conversion first. */
3391
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3392
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3393
            return -1;
3394
        }
3395
    }
3396
#endif
3397
3398
448
    return res;
3399
448
}
3400
3401
wchar_t*
3402
PyUnicode_AsWideCharString(PyObject *unicode,
3403
                           Py_ssize_t *size)
3404
1.27k
{
3405
1.27k
    wchar_t *buffer;
3406
1.27k
    Py_ssize_t buflen;
3407
3408
1.27k
    if (unicode == NULL) {
3409
0
        PyErr_BadInternalCall();
3410
0
        return NULL;
3411
0
    }
3412
1.27k
    if (!PyUnicode_Check(unicode)) {
3413
0
        PyErr_BadArgument();
3414
0
        return NULL;
3415
0
    }
3416
3417
1.27k
    buflen = unicode_get_widechar_size(unicode);
3418
1.27k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3419
1.27k
    if (buffer == NULL) {
3420
0
        PyErr_NoMemory();
3421
0
        return NULL;
3422
0
    }
3423
1.27k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3424
3425
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3426
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3427
       non-Unicode locales and hence needs conversion first. */
3428
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3429
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3430
            return NULL;
3431
        }
3432
    }
3433
#endif
3434
3435
1.27k
    if (size != NULL) {
3436
826
        *size = buflen;
3437
826
    }
3438
448
    else if (wcslen(buffer) != (size_t)buflen) {
3439
0
        PyMem_Free(buffer);
3440
0
        PyErr_SetString(PyExc_ValueError,
3441
0
                        "embedded null character");
3442
0
        return NULL;
3443
0
    }
3444
1.27k
    return buffer;
3445
1.27k
}
3446
3447
#endif /* HAVE_WCHAR_H */
3448
3449
int
3450
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3451
0
{
3452
0
    wchar_t **p = (wchar_t **)ptr;
3453
0
    if (obj == NULL) {
3454
0
        PyMem_Free(*p);
3455
0
        *p = NULL;
3456
0
        return 1;
3457
0
    }
3458
0
    if (PyUnicode_Check(obj)) {
3459
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3460
0
        if (*p == NULL) {
3461
0
            return 0;
3462
0
        }
3463
0
        return Py_CLEANUP_SUPPORTED;
3464
0
    }
3465
0
    PyErr_Format(PyExc_TypeError,
3466
0
                 "argument must be str, not %.50s",
3467
0
                 Py_TYPE(obj)->tp_name);
3468
0
    return 0;
3469
0
}
3470
3471
int
3472
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3473
0
{
3474
0
    wchar_t **p = (wchar_t **)ptr;
3475
0
    if (obj == NULL) {
3476
0
        PyMem_Free(*p);
3477
0
        *p = NULL;
3478
0
        return 1;
3479
0
    }
3480
0
    if (obj == Py_None) {
3481
0
        *p = NULL;
3482
0
        return 1;
3483
0
    }
3484
0
    if (PyUnicode_Check(obj)) {
3485
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3486
0
        if (*p == NULL) {
3487
0
            return 0;
3488
0
        }
3489
0
        return Py_CLEANUP_SUPPORTED;
3490
0
    }
3491
0
    PyErr_Format(PyExc_TypeError,
3492
0
                 "argument must be str or None, not %.50s",
3493
0
                 Py_TYPE(obj)->tp_name);
3494
0
    return 0;
3495
0
}
3496
3497
PyObject *
3498
PyUnicode_FromOrdinal(int ordinal)
3499
211k
{
3500
211k
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3501
0
        PyErr_SetString(PyExc_ValueError,
3502
0
                        "chr() arg not in range(0x110000)");
3503
0
        return NULL;
3504
0
    }
3505
3506
211k
    return unicode_char((Py_UCS4)ordinal);
3507
211k
}
3508
3509
/* Return a str for `obj`.

   An exact str is returned as a new reference to the same object.  An
   instance of a str subtype is copied with _PyUnicode_Copy() so that the
   result is a true str carrying the same data.  Anything else raises
   TypeError and returns NULL.

   XXX Perhaps we should make this API an alias of PyObject_Str()
   instead ?! */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    /* Most common case first: nothing to convert. */
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    /* str subtype: hand back a plain str. */
    return _PyUnicode_Copy(obj);
}
3527
3528
/* Decode a bytes or bytes-like object to str.

   obj:      bytes, or any object exporting a simple buffer.  NULL raises a
             bad-internal-call error; a str raises TypeError.
   encoding: passed through to PyUnicode_Decode().
   errors:   passed through to PyUnicode_Decode().

   Empty input returns the empty string after encoding/errors have been
   validated by unicode_check_encoding_errors().  Return NULL with an
   exception set on failure. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        /* The view is released before the arguments are validated. */
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3580
3581
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased with Py_TOLOWER).
   Every run of other characters located between two copied characters
   becomes a single '_'; such runs at the start or at the end of the name
   are dropped.

   encoding:  NUL-terminated input name, must not be NULL.
   lower:     output buffer, always NUL-terminated on success.
   lower_len: size of the output buffer in bytes, terminator included.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0 so that not even the terminator fits). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    /* lower_len - 1 below would wrap around for an empty buffer and every
       write would then land out of bounds. */
    if (lower_len == 0) {
        return 0;
    }

    char *out = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    char *const out_end = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if another
               copied character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3629
3630
PyObject *
3631
PyUnicode_Decode(const char *s,
3632
                 Py_ssize_t size,
3633
                 const char *encoding,
3634
                 const char *errors)
3635
4.62M
{
3636
4.62M
    PyObject *buffer = NULL, *unicode;
3637
4.62M
    Py_buffer info;
3638
4.62M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3639
3640
4.62M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3641
0
        return NULL;
3642
0
    }
3643
3644
4.62M
    if (size == 0) {
3645
0
        _Py_RETURN_UNICODE_EMPTY();
3646
0
    }
3647
3648
4.62M
    if (encoding == NULL) {
3649
32.8k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3650
32.8k
    }
3651
3652
    /* Shortcuts for common default encodings */
3653
4.58M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3654
4.58M
        char *lower = buflower;
3655
3656
        /* Fast paths */
3657
4.58M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3658
773k
            lower += 3;
3659
773k
            if (*lower == '_') {
3660
                /* Match "utf8" and "utf_8" */
3661
773k
                lower++;
3662
773k
            }
3663
3664
773k
            if (lower[0] == '8' && lower[1] == 0) {
3665
772k
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3666
772k
            }
3667
914
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3668
150
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3669
150
            }
3670
764
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3671
121
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3672
121
            }
3673
773k
        }
3674
3.80M
        else {
3675
3.80M
            if (strcmp(lower, "ascii") == 0
3676
3.80M
                || strcmp(lower, "us_ascii") == 0) {
3677
726k
                return PyUnicode_DecodeASCII(s, size, errors);
3678
726k
            }
3679
    #ifdef MS_WINDOWS
3680
            else if (strcmp(lower, "mbcs") == 0) {
3681
                return PyUnicode_DecodeMBCS(s, size, errors);
3682
            }
3683
    #endif
3684
3.07M
            else if (strcmp(lower, "latin1") == 0
3685
3.07M
                     || strcmp(lower, "latin_1") == 0
3686
3.07M
                     || strcmp(lower, "iso_8859_1") == 0
3687
3.07M
                     || strcmp(lower, "iso8859_1") == 0) {
3688
2.75M
                return PyUnicode_DecodeLatin1(s, size, errors);
3689
2.75M
            }
3690
3.80M
        }
3691
4.58M
    }
3692
3693
    /* Decode via the codec registry */
3694
332k
    buffer = NULL;
3695
332k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3696
0
        goto onError;
3697
332k
    buffer = PyMemoryView_FromBuffer(&info);
3698
332k
    if (buffer == NULL)
3699
0
        goto onError;
3700
332k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3701
332k
    if (unicode == NULL)
3702
138k
        goto onError;
3703
193k
    if (!PyUnicode_Check(unicode)) {
3704
0
        PyErr_Format(PyExc_TypeError,
3705
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3706
0
                     "use codecs.decode() to decode to arbitrary types",
3707
0
                     encoding,
3708
0
                     Py_TYPE(unicode)->tp_name);
3709
0
        Py_DECREF(unicode);
3710
0
        goto onError;
3711
0
    }
3712
193k
    Py_DECREF(buffer);
3713
193k
    return unicode_result(unicode);
3714
3715
138k
  onError:
3716
138k
    Py_XDECREF(buffer);
3717
138k
    return NULL;
3718
193k
}
3719
3720
/* Decode a str object through the codec registry and return whatever
   object the decoder produces.

   A non-str argument raises a bad-argument error.  A NULL encoding is
   replaced with PyUnicode_GetDefaultEncoding(). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec = encoding;
    if (codec == NULL) {
        codec = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec, errors);
}
3736
3737
/* Decode a str object through the codec registry, requiring the decoder to
   return a str.

   unicode:  must be a str, otherwise a bad-argument error is raised.
   encoding: codec name; NULL is replaced with
             PyUnicode_GetDefaultEncoding().
   errors:   forwarded to PyCodec_Decode().

   Return the decoded str, or NULL with an exception set.  TypeError is
   raised when the decoder returns something that is not a str. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        goto onError;
    if (!PyUnicode_Check(v)) {
        /* Report the type of the decoder's result (v), not of the input:
           the input is known to be a str at this point, so naming it would
           produce the nonsensical "returned 'str' instead of 'str'". */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        goto onError;
    }
    return unicode_result(v);

  onError:
    return NULL;
}
3770
3771
/* Encode a str object through the codec registry and return whatever
   object the encoder produces.

   A non-str argument raises a bad-argument error.  A NULL encoding is
   replaced with PyUnicode_GetDefaultEncoding().  Return NULL on failure. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec = encoding;
    if (codec == NULL) {
        codec = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry; a NULL result is passed through. */
    return PyCodec_Encode(unicode, codec, errors);
}
3795
3796
3797
static PyObject *
3798
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3799
                      int current_locale)
3800
426
{
3801
426
    Py_ssize_t wlen;
3802
426
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3803
426
    if (wstr == NULL) {
3804
0
        return NULL;
3805
0
    }
3806
3807
426
    if ((size_t)wlen != wcslen(wstr)) {
3808
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3809
0
        PyMem_Free(wstr);
3810
0
        return NULL;
3811
0
    }
3812
3813
426
    char *str;
3814
426
    size_t error_pos;
3815
426
    const char *reason;
3816
426
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3817
426
                                 current_locale, error_handler);
3818
426
    PyMem_Free(wstr);
3819
3820
426
    if (res != 0) {
3821
0
        if (res == -2) {
3822
0
            PyObject *exc;
3823
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3824
0
                    "locale", unicode,
3825
0
                    (Py_ssize_t)error_pos,
3826
0
                    (Py_ssize_t)(error_pos+1),
3827
0
                    reason);
3828
0
            if (exc != NULL) {
3829
0
                PyCodec_StrictErrors(exc);
3830
0
                Py_DECREF(exc);
3831
0
            }
3832
0
        }
3833
0
        else if (res == -3) {
3834
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3835
0
        }
3836
0
        else {
3837
0
            PyErr_NoMemory();
3838
0
        }
3839
0
        return NULL;
3840
0
    }
3841
3842
426
    PyObject *bytes = PyBytes_FromString(str);
3843
426
    PyMem_RawFree(str);
3844
426
    return bytes;
3845
426
}
3846
3847
/* Encode a str object with unicode_encode_locale(), passing 1 for
   current_locale.  `errors` is translated to an error handler by
   _Py_GetErrorHandler(). */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    return unicode_encode_locale(unicode, _Py_GetErrorHandler(errors), 1);
}
3853
3854
/* Encode a str object with the filesystem codec of the current interpreter.

   Three cases, tried in order:
     1. fs_codec->utf8 is set: use the UTF-8 encoder with the configured
        error handler;
     2. fs_codec->encoding is set (unless _Py_FORCE_UTF8_FS_ENCODING is
        defined): go through PyUnicode_AsEncodedString();
     3. otherwise fall back on the locale encoder (or on UTF-8 when
        _Py_FORCE_UTF8_FS_ENCODING is defined), using the error handler
        named by config->filesystem_errors. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }

#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *filesystem_errors = config->filesystem_errors;
    assert(filesystem_errors != NULL);
    _Py_error_handler handler = get_error_handler_wide(filesystem_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3887
3888
/* Encode a str object to bytes.

   unicode:  must be a str, otherwise a bad-argument error is raised.
   encoding: codec name; NULL selects the UTF-8 encoder.
   errors:   error handler name, forwarded to the selected encoder.

   UTF-8/16/32, ASCII, Latin-1 (and "mbcs" on Windows) are dispatched
   directly to their C encoders once the name has been normalized; any
   other name goes through the codec registry.  A registry encoder that
   returns a bytearray triggers a RuntimeWarning and the result is copied
   into a bytes object; any other non-bytes result raises TypeError.

   Return a new bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* 11 == sizeof("iso_8859_1"): the longest shortcut name plus NUL. */
    char normalized[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }

    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized))) {
        const char *name = normalized;

        /* Fast paths */
        if (strncmp(name, "utf", 3) == 0) {
            name += 3;
            if (*name == '_') {
                /* Match "utf8" and "utf_8" */
                name++;
            }

            if (strcmp(name, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            else if (strcmp(name, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            else if (strcmp(name, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else {
            if (strcmp(name, "ascii") == 0
                || strcmp(name, "us_ascii") == 0) {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            else if (strcmp(name, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            else if (strcmp(name, "latin1") == 0 ||
                     strcmp(name, "latin_1") == 0 ||
                     strcmp(name, "iso_8859_1") == 0 ||
                     strcmp(name, "iso8859_1") == 0) {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    PyObject *encoded = _PyCodec_EncodeText(unicode, encoding, errors);

    /* Failure (NULL) and the normal path (bytes) are both returned as is. */
    if (encoded == NULL || PyBytes_Check(encoded)) {
        return encoded;
    }

    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* If the codec returns a buffer, raise a warning and convert to bytes */
    if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding)) {
        Py_DECREF(encoded);
        return NULL;
    }

    PyObject *bytes = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                                PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return bytes;
}
3987
3988
/* Encode a str object through the codec registry, requiring the encoder to
   return a str.

   A non-str argument raises a bad-argument error.  A NULL encoding is
   replaced with PyUnicode_GetDefaultEncoding().  TypeError is raised when
   the encoder returns something that is not a str.

   Return the encoder's result, or NULL with an exception set. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    PyObject *result = PyCodec_Encode(unicode, encoding, errors);
    if (result == NULL) {
        return NULL;
    }

    if (PyUnicode_Check(result)) {
        return result;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(result)->tp_name);
    Py_DECREF(result);
    return NULL;
}
4021
4022
static PyObject*
4023
unicode_decode_locale(const char *str, Py_ssize_t len,
4024
                      _Py_error_handler errors, int current_locale)
4025
17.1k
{
4026
17.1k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
4027
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4028
0
        return NULL;
4029
0
    }
4030
4031
17.1k
    wchar_t *wstr;
4032
17.1k
    size_t wlen;
4033
17.1k
    const char *reason;
4034
17.1k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
4035
17.1k
                                 current_locale, errors);
4036
17.1k
    if (res != 0) {
4037
0
        if (res == -2) {
4038
0
            PyObject *exc;
4039
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4040
0
                                        "locale", str, len,
4041
0
                                        (Py_ssize_t)wlen,
4042
0
                                        (Py_ssize_t)(wlen + 1),
4043
0
                                        reason);
4044
0
            if (exc != NULL) {
4045
0
                PyCodec_StrictErrors(exc);
4046
0
                Py_DECREF(exc);
4047
0
            }
4048
0
        }
4049
0
        else if (res == -3) {
4050
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4051
0
        }
4052
0
        else {
4053
0
            PyErr_NoMemory();
4054
0
        }
4055
0
        return NULL;
4056
0
    }
4057
4058
17.1k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4059
17.1k
    PyMem_RawFree(wstr);
4060
17.1k
    return unicode;
4061
17.1k
}
4062
4063
PyObject*
4064
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4065
                              const char *errors)
4066
0
{
4067
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4068
0
    return unicode_decode_locale(str, len, error_handler, 1);
4069
0
}
4070
4071
PyObject*
4072
PyUnicode_DecodeLocale(const char *str, const char *errors)
4073
11.9k
{
4074
11.9k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
4075
11.9k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4076
11.9k
    return unicode_decode_locale(str, size, error_handler, 1);
4077
11.9k
}
4078
4079
4080
PyObject*
4081
0
PyUnicode_DecodeFSDefault(const char *s) {
4082
0
    Py_ssize_t size = (Py_ssize_t)strlen(s);
4083
0
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
4084
0
}
4085
4086
PyObject*
4087
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4088
6.72k
{
4089
6.72k
    PyInterpreterState *interp = _PyInterpreterState_GET();
4090
6.72k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4091
6.72k
    if (fs_codec->utf8) {
4092
1.59k
        return unicode_decode_utf8(s, size,
4093
1.59k
                                   fs_codec->error_handler,
4094
1.59k
                                   fs_codec->errors,
4095
1.59k
                                   NULL);
4096
1.59k
    }
4097
5.13k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
4098
5.13k
    else if (fs_codec->encoding) {
4099
0
        return PyUnicode_Decode(s, size,
4100
0
                                fs_codec->encoding,
4101
0
                                fs_codec->errors);
4102
0
    }
4103
5.13k
#endif
4104
5.13k
    else {
4105
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
4106
           machinery is not ready and so cannot be used:
4107
           use mbstowcs() in this case. */
4108
5.13k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4109
5.13k
        const wchar_t *filesystem_errors = config->filesystem_errors;
4110
5.13k
        assert(filesystem_errors != NULL);
4111
5.13k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4112
5.13k
        assert(errors != _Py_ERROR_UNKNOWN);
4113
#ifdef _Py_FORCE_UTF8_FS_ENCODING
4114
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
4115
#else
4116
5.13k
        return unicode_decode_locale(s, size, errors, 0);
4117
5.13k
#endif
4118
5.13k
    }
4119
6.72k
}
4120
4121
4122
int
4123
PyUnicode_FSConverter(PyObject* arg, void* addr)
4124
12.6k
{
4125
12.6k
    PyObject *path = NULL;
4126
12.6k
    PyObject *output = NULL;
4127
12.6k
    Py_ssize_t size;
4128
12.6k
    const char *data;
4129
12.6k
    if (arg == NULL) {
4130
0
        Py_DECREF(*(PyObject**)addr);
4131
0
        *(PyObject**)addr = NULL;
4132
0
        return 1;
4133
0
    }
4134
12.6k
    path = PyOS_FSPath(arg);
4135
12.6k
    if (path == NULL) {
4136
0
        return 0;
4137
0
    }
4138
12.6k
    if (PyBytes_Check(path)) {
4139
0
        output = path;
4140
0
    }
4141
12.6k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4142
12.6k
        output = PyUnicode_EncodeFSDefault(path);
4143
12.6k
        Py_DECREF(path);
4144
12.6k
        if (!output) {
4145
0
            return 0;
4146
0
        }
4147
12.6k
        assert(PyBytes_Check(output));
4148
12.6k
    }
4149
4150
12.6k
    size = PyBytes_GET_SIZE(output);
4151
12.6k
    data = PyBytes_AS_STRING(output);
4152
12.6k
    if ((size_t)size != strlen(data)) {
4153
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4154
0
        Py_DECREF(output);
4155
0
        return 0;
4156
0
    }
4157
12.6k
    *(PyObject**)addr = output;
4158
12.6k
    return Py_CLEANUP_SUPPORTED;
4159
12.6k
}
4160
4161
4162
int
4163
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4164
22.3k
{
4165
22.3k
    if (arg == NULL) {
4166
0
        Py_DECREF(*(PyObject**)addr);
4167
0
        *(PyObject**)addr = NULL;
4168
0
        return 1;
4169
0
    }
4170
4171
22.3k
    PyObject *path = PyOS_FSPath(arg);
4172
22.3k
    if (path == NULL) {
4173
0
        return 0;
4174
0
    }
4175
4176
22.3k
    PyObject *output = NULL;
4177
22.3k
    if (PyUnicode_Check(path)) {
4178
22.3k
        output = path;
4179
22.3k
    }
4180
0
    else if (PyBytes_Check(path)) {
4181
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4182
0
                                                  PyBytes_GET_SIZE(path));
4183
0
        Py_DECREF(path);
4184
0
        if (!output) {
4185
0
            return 0;
4186
0
        }
4187
0
    }
4188
0
    else {
4189
0
        PyErr_Format(PyExc_TypeError,
4190
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4191
0
                     Py_TYPE(arg)->tp_name);
4192
0
        Py_DECREF(path);
4193
0
        return 0;
4194
0
    }
4195
4196
22.3k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4197
22.3k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4198
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4199
0
        Py_DECREF(output);
4200
0
        return 0;
4201
0
    }
4202
22.3k
    *(PyObject**)addr = output;
4203
22.3k
    return Py_CLEANUP_SUPPORTED;
4204
22.3k
}
4205
4206
4207
static int unicode_fill_utf8(PyObject *unicode);
4208
4209
4210
static int
4211
unicode_ensure_utf8(PyObject *unicode)
4212
19.7M
{
4213
19.7M
    int err = 0;
4214
19.7M
    if (PyUnicode_UTF8(unicode) == NULL) {
4215
152k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4216
152k
        if (PyUnicode_UTF8(unicode) == NULL) {
4217
152k
            err = unicode_fill_utf8(unicode);
4218
152k
        }
4219
152k
        Py_END_CRITICAL_SECTION();
4220
152k
    }
4221
19.7M
    return err;
4222
19.7M
}
4223
4224
const char *
4225
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4226
19.7M
{
4227
19.7M
    if (!PyUnicode_Check(unicode)) {
4228
0
        PyErr_BadArgument();
4229
0
        if (psize) {
4230
0
            *psize = -1;
4231
0
        }
4232
0
        return NULL;
4233
0
    }
4234
4235
19.7M
    if (unicode_ensure_utf8(unicode) == -1) {
4236
206
        if (psize) {
4237
206
            *psize = -1;
4238
206
        }
4239
206
        return NULL;
4240
206
    }
4241
4242
19.7M
    if (psize) {
4243
19.6M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4244
19.6M
    }
4245
19.7M
    return PyUnicode_UTF8(unicode);
4246
19.7M
}
4247
4248
const char *
4249
PyUnicode_AsUTF8(PyObject *unicode)
4250
68.5k
{
4251
68.5k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4252
68.5k
}
4253
4254
const char *
4255
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4256
1.57M
{
4257
1.57M
    Py_ssize_t size;
4258
1.57M
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4259
1.57M
    if (s && strlen(s) != (size_t)size) {
4260
160
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4261
160
        return NULL;
4262
160
    }
4263
1.57M
    return s;
4264
1.57M
}
4265
4266
/*
4267
PyUnicode_GetSize() has been deprecated since Python 3.3
4268
because it returned length of Py_UNICODE.
4269
4270
But this function is part of stable abi, because it doesn't
4271
include Py_UNICODE in signature and it was not excluded from
4272
stable ABI in PEP 384.
4273
*/
4274
PyAPI_FUNC(Py_ssize_t)
4275
PyUnicode_GetSize(PyObject *unicode)
4276
0
{
4277
0
    PyErr_SetString(PyExc_RuntimeError,
4278
0
                    "PyUnicode_GetSize has been removed.");
4279
0
    return -1;
4280
0
}
4281
4282
Py_ssize_t
4283
PyUnicode_GetLength(PyObject *unicode)
4284
33.1k
{
4285
33.1k
    if (!PyUnicode_Check(unicode)) {
4286
0
        PyErr_BadArgument();
4287
0
        return -1;
4288
0
    }
4289
33.1k
    return PyUnicode_GET_LENGTH(unicode);
4290
33.1k
}
4291
4292
Py_UCS4
4293
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4294
29
{
4295
29
    const void *data;
4296
29
    int kind;
4297
4298
29
    if (!PyUnicode_Check(unicode)) {
4299
0
        PyErr_BadArgument();
4300
0
        return (Py_UCS4)-1;
4301
0
    }
4302
29
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4303
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4304
0
        return (Py_UCS4)-1;
4305
0
    }
4306
29
    data = PyUnicode_DATA(unicode);
4307
29
    kind = PyUnicode_KIND(unicode);
4308
29
    return PyUnicode_READ(kind, data, index);
4309
29
}
4310
4311
int
4312
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4313
0
{
4314
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4315
0
        PyErr_BadArgument();
4316
0
        return -1;
4317
0
    }
4318
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4319
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4320
0
        return -1;
4321
0
    }
4322
0
    if (unicode_check_modifiable(unicode))
4323
0
        return -1;
4324
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4325
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4326
0
        return -1;
4327
0
    }
4328
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4329
0
                    index, ch);
4330
0
    return 0;
4331
0
}
4332
4333
/* Return the name of the default encoding: always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4338
4339
/* create or adjust a UnicodeDecodeError */
4340
static void
4341
make_decode_exception(PyObject **exceptionObject,
4342
                      const char *encoding,
4343
                      const char *input, Py_ssize_t length,
4344
                      Py_ssize_t startpos, Py_ssize_t endpos,
4345
                      const char *reason)
4346
281k
{
4347
281k
    if (*exceptionObject == NULL) {
4348
74.7k
        *exceptionObject = PyUnicodeDecodeError_Create(
4349
74.7k
            encoding, input, length, startpos, endpos, reason);
4350
74.7k
    }
4351
206k
    else {
4352
206k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4353
0
            goto onError;
4354
206k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4355
0
            goto onError;
4356
206k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4357
0
            goto onError;
4358
206k
    }
4359
281k
    return;
4360
4361
281k
onError:
4362
0
    Py_CLEAR(*exceptionObject);
4363
0
}
4364
4365
#ifdef MS_WINDOWS
4366
static int
4367
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4368
{
4369
    if (newsize > *size) {
4370
        wchar_t *newbuf = *buf;
4371
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4372
            PyErr_NoMemory();
4373
            return -1;
4374
        }
4375
        *buf = newbuf;
4376
    }
4377
    *size = newsize;
4378
    return 0;
4379
}
4380
4381
/* error handling callback helper:
4382
   build arguments, call the callback and check the arguments,
4383
   if no exception occurred, copy the replacement to the output
4384
   and adjust various state variables.
4385
   return 0 on success, -1 on error
4386
*/
4387
4388
static int
4389
unicode_decode_call_errorhandler_wchar(
4390
    const char *errors, PyObject **errorHandler,
4391
    const char *encoding, const char *reason,
4392
    const char **input, const char **inend, Py_ssize_t *startinpos,
4393
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4394
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4395
{
4396
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4397
4398
    PyObject *restuple = NULL;
4399
    PyObject *repunicode = NULL;
4400
    Py_ssize_t outsize;
4401
    Py_ssize_t insize;
4402
    Py_ssize_t requiredsize;
4403
    Py_ssize_t newpos;
4404
    PyObject *inputobj = NULL;
4405
    Py_ssize_t repwlen;
4406
4407
    if (*errorHandler == NULL) {
4408
        *errorHandler = PyCodec_LookupError(errors);
4409
        if (*errorHandler == NULL)
4410
            goto onError;
4411
    }
4412
4413
    make_decode_exception(exceptionObject,
4414
        encoding,
4415
        *input, *inend - *input,
4416
        *startinpos, *endinpos,
4417
        reason);
4418
    if (*exceptionObject == NULL)
4419
        goto onError;
4420
4421
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4422
    if (restuple == NULL)
4423
        goto onError;
4424
    if (!PyTuple_Check(restuple)) {
4425
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4426
        goto onError;
4427
    }
4428
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4429
        goto onError;
4430
4431
    /* Copy back the bytes variables, which might have been modified by the
4432
       callback */
4433
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4434
    if (!inputobj)
4435
        goto onError;
4436
    *input = PyBytes_AS_STRING(inputobj);
4437
    insize = PyBytes_GET_SIZE(inputobj);
4438
    *inend = *input + insize;
4439
    /* we can DECREF safely, as the exception has another reference,
4440
       so the object won't go away. */
4441
    Py_DECREF(inputobj);
4442
4443
    if (newpos<0)
4444
        newpos = insize+newpos;
4445
    if (newpos<0 || newpos>insize) {
4446
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4447
        goto onError;
4448
    }
4449
4450
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4451
    if (repwlen < 0)
4452
        goto onError;
4453
    repwlen--;
4454
    /* need more space? (at least enough for what we
4455
       have+the replacement+the rest of the string (starting
4456
       at the new input position), so we won't have to check space
4457
       when there are no errors in the rest of the string) */
4458
    requiredsize = *outpos;
4459
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4460
        goto overflow;
4461
    requiredsize += repwlen;
4462
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4463
        goto overflow;
4464
    requiredsize += insize - newpos;
4465
    outsize = *bufsize;
4466
    if (requiredsize > outsize) {
4467
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4468
            requiredsize = 2*outsize;
4469
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4470
            goto onError;
4471
        }
4472
    }
4473
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4474
    *outpos += repwlen;
4475
    *endinpos = newpos;
4476
    *inptr = *input + newpos;
4477
4478
    /* we made it! */
4479
    Py_DECREF(restuple);
4480
    return 0;
4481
4482
  overflow:
4483
    PyErr_SetString(PyExc_OverflowError,
4484
                    "decoded result is too long for a Python string");
4485
4486
  onError:
4487
    Py_XDECREF(restuple);
4488
    return -1;
4489
}
4490
#endif   /* MS_WINDOWS */
4491
4492
static int
4493
unicode_decode_call_errorhandler_writer(
4494
    const char *errors, PyObject **errorHandler,
4495
    const char *encoding, const char *reason,
4496
    const char **input, const char **inend, Py_ssize_t *startinpos,
4497
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4498
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4499
281k
{
4500
281k
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4501
4502
281k
    PyObject *restuple = NULL;
4503
281k
    PyObject *repunicode = NULL;
4504
281k
    Py_ssize_t insize;
4505
281k
    Py_ssize_t newpos;
4506
281k
    Py_ssize_t replen;
4507
281k
    Py_ssize_t remain;
4508
281k
    PyObject *inputobj = NULL;
4509
281k
    int need_to_grow = 0;
4510
281k
    const char *new_inptr;
4511
4512
281k
    if (*errorHandler == NULL) {
4513
74.7k
        *errorHandler = PyCodec_LookupError(errors);
4514
74.7k
        if (*errorHandler == NULL)
4515
0
            goto onError;
4516
74.7k
    }
4517
4518
281k
    make_decode_exception(exceptionObject,
4519
281k
        encoding,
4520
281k
        *input, *inend - *input,
4521
281k
        *startinpos, *endinpos,
4522
281k
        reason);
4523
281k
    if (*exceptionObject == NULL)
4524
0
        goto onError;
4525
4526
281k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4527
281k
    if (restuple == NULL)
4528
42.8k
        goto onError;
4529
238k
    if (!PyTuple_Check(restuple)) {
4530
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4531
0
        goto onError;
4532
0
    }
4533
238k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4534
0
        goto onError;
4535
4536
    /* Copy back the bytes variables, which might have been modified by the
4537
       callback */
4538
238k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4539
238k
    if (!inputobj)
4540
0
        goto onError;
4541
238k
    remain = *inend - *input - *endinpos;
4542
238k
    *input = PyBytes_AS_STRING(inputobj);
4543
238k
    insize = PyBytes_GET_SIZE(inputobj);
4544
238k
    *inend = *input + insize;
4545
    /* we can DECREF safely, as the exception has another reference,
4546
       so the object won't go away. */
4547
238k
    Py_DECREF(inputobj);
4548
4549
238k
    if (newpos<0)
4550
0
        newpos = insize+newpos;
4551
238k
    if (newpos<0 || newpos>insize) {
4552
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4553
0
        goto onError;
4554
0
    }
4555
4556
238k
    replen = PyUnicode_GET_LENGTH(repunicode);
4557
238k
    if (replen > 1) {
4558
18.3k
        writer->min_length += replen - 1;
4559
18.3k
        need_to_grow = 1;
4560
18.3k
    }
4561
238k
    new_inptr = *input + newpos;
4562
238k
    if (*inend - new_inptr > remain) {
4563
        /* We don't know the decoding algorithm here so we make the worst
4564
           assumption that one byte decodes to one unicode character.
4565
           If unfortunately one byte could decode to more unicode characters,
4566
           the decoder may write out-of-bound then.  Is it possible for the
4567
           algorithms using this function? */
4568
6.26k
        writer->min_length += *inend - new_inptr - remain;
4569
6.26k
        need_to_grow = 1;
4570
6.26k
    }
4571
238k
    if (need_to_grow) {
4572
18.5k
        writer->overallocate = 1;
4573
18.5k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4574
18.5k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4575
0
            goto onError;
4576
18.5k
    }
4577
238k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4578
0
        goto onError;
4579
4580
238k
    *endinpos = newpos;
4581
238k
    *inptr = new_inptr;
4582
4583
    /* we made it! */
4584
238k
    Py_DECREF(restuple);
4585
238k
    return 0;
4586
4587
42.8k
  onError:
4588
42.8k
    Py_XDECREF(restuple);
4589
42.8k
    return -1;
4590
238k
}
4591
4592
/* --- UTF-7 Codec -------------------------------------------------------- */
4593
4594
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4595
4596
/* Three simple macros defining base-64. */
4597
4598
/* Non-zero when c is a base-64 alphabet character:
   'A'-'Z', 'a'-'z', '0'-'9', '+' or '/'.
   Note: c is evaluated several times. */

#define IS_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ||     \
     ('a' <= (c) && (c) <= 'z') ||     \
     ('0' <= (c) && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
4605
4606
/* Map a base-64 character c to its 6-bit value:
   'A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62,
   anything else (i.e. '/' for valid input) -> 63.
   c must already satisfy IS_BASE64(); it is evaluated several times. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)
4613
4614
/* Base-64 character encoding the low 6 bits of n (higher bits are
   masked off). */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])
4618
4619
/* DECODE_DIRECT: non-zero when a byte found in a UTF-7 string outside
 * a shift sequence decodes as itself.  Decoding is permissive: every
 * value up to 127 qualifies except '+', which opens a base64
 * section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
4626
4627
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only, so
 * it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - character value; only 1..127 can ever be direct
 *              (evaluated several times)
 *   directO  - non-zero: Set O characters (category 1) are direct
 *   directWS - non-zero: whitespace (category 2) is direct
 *
 * Set D characters (category 0) are always direct.  The flag
 * arguments are parenthesized so that compound expressions such as
 * `a || b` keep their meaning inside the expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4672
4673
PyObject *
4674
PyUnicode_DecodeUTF7(const char *s,
4675
                     Py_ssize_t size,
4676
                     const char *errors)
4677
0
{
4678
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4679
0
}
4680
4681
/* The decoder.  The only state we preserve is our read position,
4682
 * i.e. how many characters we have consumed.  So if we end in the
4683
 * middle of a shift sequence we have to back off the read position
4684
 * and the output to the beginning of the sequence, otherwise we lose
4685
 * all the shift state (seen bits, number of bits seen, high
4686
 * surrogate). */
4687
4688
PyObject *
4689
PyUnicode_DecodeUTF7Stateful(const char *s,
4690
                             Py_ssize_t size,
4691
                             const char *errors,
4692
                             Py_ssize_t *consumed)
4693
27.3k
{
4694
27.3k
    const char *starts = s;
4695
27.3k
    Py_ssize_t startinpos;
4696
27.3k
    Py_ssize_t endinpos;
4697
27.3k
    const char *e;
4698
27.3k
    _PyUnicodeWriter writer;
4699
27.3k
    const char *errmsg = "";
4700
27.3k
    int inShift = 0;
4701
27.3k
    Py_ssize_t shiftOutStart;
4702
27.3k
    unsigned int base64bits = 0;
4703
27.3k
    unsigned long base64buffer = 0;
4704
27.3k
    Py_UCS4 surrogate = 0;
4705
27.3k
    PyObject *errorHandler = NULL;
4706
27.3k
    PyObject *exc = NULL;
4707
4708
27.3k
    if (size == 0) {
4709
0
        if (consumed)
4710
0
            *consumed = 0;
4711
0
        _Py_RETURN_UNICODE_EMPTY();
4712
0
    }
4713
4714
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4715
27.3k
    _PyUnicodeWriter_Init(&writer);
4716
27.3k
    writer.min_length = size;
4717
4718
27.3k
    shiftOutStart = 0;
4719
27.3k
    e = s + size;
4720
4721
7.84M
    while (s < e) {
4722
7.82M
        Py_UCS4 ch;
4723
7.82M
      restart:
4724
7.82M
        ch = (unsigned char) *s;
4725
4726
7.82M
        if (inShift) { /* in a base-64 section */
4727
237k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4728
218k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4729
218k
                base64bits += 6;
4730
218k
                s++;
4731
218k
                if (base64bits >= 16) {
4732
                    /* we have enough bits for a UTF-16 value */
4733
75.2k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4734
75.2k
                    base64bits -= 16;
4735
75.2k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4736
75.2k
                    assert(outCh <= 0xffff);
4737
75.2k
                    if (surrogate) {
4738
                        /* expecting a second surrogate */
4739
8.08k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4740
3.08k
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4741
3.08k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4742
0
                                goto onError;
4743
3.08k
                            surrogate = 0;
4744
3.08k
                            continue;
4745
3.08k
                        }
4746
5.00k
                        else {
4747
5.00k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4748
0
                                goto onError;
4749
5.00k
                            surrogate = 0;
4750
5.00k
                        }
4751
8.08k
                    }
4752
72.1k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4753
                        /* first surrogate */
4754
11.4k
                        surrogate = outCh;
4755
11.4k
                    }
4756
60.7k
                    else {
4757
60.7k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4758
0
                            goto onError;
4759
60.7k
                    }
4760
72.1k
                }
4761
218k
            }
4762
19.1k
            else { /* now leaving a base-64 section */
4763
19.1k
                inShift = 0;
4764
19.1k
                if (base64bits > 0) { /* left-over bits */
4765
16.0k
                    if (base64bits >= 6) {
4766
                        /* We've seen at least one base-64 character */
4767
8.17k
                        s++;
4768
8.17k
                        errmsg = "partial character in shift sequence";
4769
8.17k
                        goto utf7Error;
4770
8.17k
                    }
4771
7.87k
                    else {
4772
                        /* Some bits remain; they should be zero */
4773
7.87k
                        if (base64buffer != 0) {
4774
2.06k
                            s++;
4775
2.06k
                            errmsg = "non-zero padding bits in shift sequence";
4776
2.06k
                            goto utf7Error;
4777
2.06k
                        }
4778
7.87k
                    }
4779
16.0k
                }
4780
8.90k
                if (surrogate && DECODE_DIRECT(ch)) {
4781
2.45k
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4782
0
                        goto onError;
4783
2.45k
                }
4784
8.90k
                surrogate = 0;
4785
8.90k
                if (ch == '-') {
4786
                    /* '-' is absorbed; other terminating
4787
                       characters are preserved */
4788
2.44k
                    s++;
4789
2.44k
                }
4790
8.90k
            }
4791
237k
        }
4792
7.59M
        else if ( ch == '+' ) {
4793
28.2k
            startinpos = s-starts;
4794
28.2k
            s++; /* consume '+' */
4795
28.2k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4796
2.36k
                s++;
4797
2.36k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4798
0
                    goto onError;
4799
2.36k
            }
4800
25.9k
            else if (s < e && !IS_BASE64(*s)) {
4801
3.69k
                s++;
4802
3.69k
                errmsg = "ill-formed sequence";
4803
3.69k
                goto utf7Error;
4804
3.69k
            }
4805
22.2k
            else { /* begin base64-encoded section */
4806
22.2k
                inShift = 1;
4807
22.2k
                surrogate = 0;
4808
22.2k
                shiftOutStart = writer.pos;
4809
22.2k
                base64bits = 0;
4810
22.2k
                base64buffer = 0;
4811
22.2k
            }
4812
28.2k
        }
4813
7.56M
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4814
7.46M
            s++;
4815
7.46M
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4816
0
                goto onError;
4817
7.46M
        }
4818
96.4k
        else {
4819
96.4k
            startinpos = s-starts;
4820
96.4k
            s++;
4821
96.4k
            errmsg = "unexpected special character";
4822
96.4k
            goto utf7Error;
4823
96.4k
        }
4824
7.71M
        continue;
4825
7.71M
utf7Error:
4826
110k
        endinpos = s-starts;
4827
110k
        if (unicode_decode_call_errorhandler_writer(
4828
110k
                errors, &errorHandler,
4829
110k
                "utf7", errmsg,
4830
110k
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4831
110k
                &writer))
4832
11.8k
            goto onError;
4833
110k
    }
4834
4835
    /* end of string */
4836
4837
15.5k
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4838
        /* if we're in an inconsistent state, that's an error */
4839
3.08k
        inShift = 0;
4840
3.08k
        if (surrogate ||
4841
3.08k
                (base64bits >= 6) ||
4842
3.08k
                (base64bits > 0 && base64buffer != 0)) {
4843
2.07k
            endinpos = size;
4844
2.07k
            if (unicode_decode_call_errorhandler_writer(
4845
2.07k
                    errors, &errorHandler,
4846
2.07k
                    "utf7", "unterminated shift sequence",
4847
2.07k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4848
2.07k
                    &writer))
4849
1.71k
                goto onError;
4850
352
            if (s < e)
4851
0
                goto restart;
4852
352
        }
4853
3.08k
    }
4854
4855
    /* return state */
4856
13.8k
    if (consumed) {
4857
0
        if (inShift) {
4858
0
            *consumed = startinpos;
4859
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4860
0
                PyObject *result = PyUnicode_FromKindAndData(
4861
0
                        writer.kind, writer.data, shiftOutStart);
4862
0
                Py_XDECREF(errorHandler);
4863
0
                Py_XDECREF(exc);
4864
0
                _PyUnicodeWriter_Dealloc(&writer);
4865
0
                return result;
4866
0
            }
4867
0
            writer.pos = shiftOutStart; /* back off output */
4868
0
        }
4869
0
        else {
4870
0
            *consumed = s-starts;
4871
0
        }
4872
0
    }
4873
4874
13.8k
    Py_XDECREF(errorHandler);
4875
13.8k
    Py_XDECREF(exc);
4876
13.8k
    return _PyUnicodeWriter_Finish(&writer);
4877
4878
13.5k
  onError:
4879
13.5k
    Py_XDECREF(errorHandler);
4880
13.5k
    Py_XDECREF(exc);
4881
13.5k
    _PyUnicodeWriter_Dealloc(&writer);
4882
13.5k
    return NULL;
4883
13.8k
}
4884
4885
4886
/* Encode the str object `str` as UTF-7 and return a new bytes object,
   or NULL with an exception set.

     base64SetO       - non-zero: Set O characters are base-64 encoded
                        instead of being written directly
     base64WhiteSpace - non-zero: whitespace is base-64 encoded instead
                        of being written directly
     errors           - not used by this function

   `str` is not type-checked here; it is passed straight to the
   PyUnicode_* accessors. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    PyObject *v;
    int in_shift = 0;               /* inside a base-64 section? */
    unsigned int nbits = 0;         /* number of pending bits in bitbuf */
    unsigned long bitbuf = 0;       /* bits not yet written out */
    char *out;
    const char *start;

    if (len == 0) {
        return PyBytes_FromStringAndSize(NULL, 0);
    }

    /* Allocate for the worst case of 8 output bytes per character,
       refusing lengths for which len * 8 would overflow.
       It might be possible to tighten this worst case. */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL) {
        return NULL;
    }

    start = out = PyBytes_AS_STRING(v);
    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (in_shift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* Shifting out: flush left-over bits, zero-padded to a
                   full base-64 character. */
                if (nbits) {
                    *out++ = TO_BASE64(bitbuf << (6-nbits));
                    bitbuf = 0;
                    nbits = 0;
                }
                in_shift = 0;
                /* Characters not in the BASE64 set implicitly unshift
                   the sequence so no '-' is required, except if the
                   character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
                continue;
            }
            /* Otherwise stay in the shift sequence and encode ch below. */
        }
        else if (ch == '+') {
            /* A literal '+' is written as "+-". */
            *out++ = '+';
            *out++ = '-';
            continue;
        }
        else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
            *out++ = (char) ch;
            continue;
        }
        else {
            /* Open a shift sequence, then encode ch below. */
            *out++ = '+';
            in_shift = 1;
        }

        /* Base-64 encode ch as one or two UTF-16 code units. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* Emit the high surrogate first ... */
            nbits += 16;
            bitbuf = (bitbuf << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (nbits >= 6) {
                *out++ = TO_BASE64(bitbuf >> (nbits-6));
                nbits -= 6;
            }
            /* ... and let the common code below emit the low one. */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        nbits += 16;
        bitbuf = (bitbuf << 16) | ch;
        while (nbits >= 6) {
            *out++ = TO_BASE64(bitbuf >> (nbits-6));
            nbits -= 6;
        }
    }

    /* Flush trailing bits and close a still-open shift sequence. */
    if (nbits) {
        *out++= TO_BASE64(bitbuf << (6-nbits) );
    }
    if (in_shift) {
        *out++ = '-';
    }
    /* Trim the over-allocated buffer to what was actually written. */
    if (_PyBytes_Resize(&v, out - start) < 0) {
        return NULL;
    }
    return v;
}
4985
4986
#undef IS_BASE64
4987
#undef FROM_BASE64
4988
#undef TO_BASE64
4989
#undef DECODE_DIRECT
4990
#undef ENCODE_DIRECT
4991
4992
/* --- UTF-8 Codec -------------------------------------------------------- */
4993
4994
PyObject *
4995
PyUnicode_DecodeUTF8(const char *s,
4996
                     Py_ssize_t size,
4997
                     const char *errors)
4998
2.38M
{
4999
2.38M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5000
2.38M
}
5001
5002
#include "stringlib/asciilib.h"
5003
#include "stringlib/codecs.h"
5004
#include "stringlib/undef.h"
5005
5006
#include "stringlib/ucs1lib.h"
5007
#include "stringlib/codecs.h"
5008
#include "stringlib/undef.h"
5009
5010
#include "stringlib/ucs2lib.h"
5011
#include "stringlib/codecs.h"
5012
#include "stringlib/undef.h"
5013
5014
#include "stringlib/ucs4lib.h"
5015
#include "stringlib/codecs.h"
5016
#include "stringlib/undef.h"
5017
5018
/* Word-wide byte patterns, sized to match 'size_t':
     ASCII_CHAR_MASK - high bit of every byte; a non-zero AND with a
                       loaded word means it contains a non-ASCII byte.
     VECTOR_0101,
     VECTOR_00FF     - used to count code points in a UTF-8 string. */
#if SIZEOF_SIZE_T == 4
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#elif SIZEOF_SIZE_T == 8
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#else
# error C 'size_t' size should be either 4 or 8!
#endif
5032
5033
/* ctz(v): index of the least significant set bit of v ("count trailing
   zeros").  HAVE_CTZ is 1 when an implementation is available for the
   compiler in use and 0 otherwise.  Both implementations rely on
   compiler intrinsics; callers in this file only pass non-zero
   values. */
#if defined(__GNUC__) || defined(__clang__)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    const unsigned long long wide = (unsigned long long)v;
    return __builtin_ctzll(wide);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long bit_index;
#if SIZEOF_SIZE_T != 4
    _BitScanForward64(&bit_index, v);
#else
    _BitScanForward(&bit_index, v);
#endif /* SIZEOF_SIZE_T */
    return bit_index;
}
#else
#define HAVE_CTZ 0
#endif
5056
5057
#if HAVE_CTZ && PY_LITTLE_ENDIAN
// Build a size_t from the bytes p[0]..p[size-1], with p[0] stored in the
// first byte of the word and all unused bytes zero.  Bytes are copied
// one at a time, so no unaligned word access happens and nothing at or
// beyond p[size] is read.  A `size` larger than SIZEOF_SIZE_T is treated
// as SIZEOF_SIZE_T.
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t word;
        unsigned char bytes[SIZEOF_SIZE_T];
    } pack;
    pack.word = 0;
    // The fall-through switch assumes little endian because:
    // * a union is faster than bitwise or and shift.
    // * big endian machines are rare and hard to maintain.
    switch (size) {
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        pack.bytes[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        pack.bytes[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        pack.bytes[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        pack.bytes[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        pack.bytes[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        pack.bytes[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        pack.bytes[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        pack.bytes[0] = p[0];
        break;
    case 0:
        // Nothing to copy; the word stays zero.
        break;
    }
    return pack.word;
}
#endif
5104
5105
/*
5106
 * Find the first non-ASCII character in a byte sequence.
5107
 *
5108
 * This function scans a range of bytes from `start` to `end` and returns the
5109
 * index of the first byte that is not an ASCII character (i.e., has the most
5110
 * significant bit set). If all characters in the range are ASCII, it returns
5111
 * `end - start`.
5112
 */
5113
static Py_ssize_t
5114
find_first_nonascii(const unsigned char *start, const unsigned char *end)
5115
13.5M
{
5116
    // The search is done in `size_t` chunks.
5117
    // The start and end might not be aligned at `size_t` boundaries,
5118
    // so they're handled specially.
5119
5120
13.5M
    const unsigned char *p = start;
5121
5122
13.5M
    if (end - start >= SIZEOF_SIZE_T) {
5123
        // Avoid unaligned read.
5124
3.06M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5125
3.06M
        size_t u;
5126
3.06M
        memcpy(&u, p, sizeof(size_t));
5127
3.06M
        u &= ASCII_CHAR_MASK;
5128
3.06M
        if (u) {
5129
129k
            return (ctz(u) - 7) / 8;
5130
129k
        }
5131
2.93M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
5132
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
5133
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
5134
        while (p < p2) {
5135
            if (*p & 0x80) {
5136
                return p - start;
5137
            }
5138
            p++;
5139
        }
5140
#endif
5141
5142
2.93M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5143
94.2M
        while (p <= e) {
5144
91.4M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5145
91.4M
            if (u) {
5146
94.0k
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5147
94.0k
                return p - start + (ctz(u) - 7) / 8;
5148
#else
5149
                // big endian and minor compilers are difficult to test.
5150
                // fallback to per byte check.
5151
                break;
5152
#endif
5153
94.0k
            }
5154
91.3M
            p += SIZEOF_SIZE_T;
5155
91.3M
        }
5156
2.93M
    }
5157
13.3M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5158
13.3M
    assert((end - p) < SIZEOF_SIZE_T);
5159
    // we can not use *(const size_t*)p to avoid buffer overrun.
5160
13.3M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5161
13.3M
    if (u) {
5162
160k
        return p - start + (ctz(u) - 7) / 8;
5163
160k
    }
5164
13.1M
    return end - start;
5165
#else
5166
    while (p < end) {
5167
        if (*p & 0x80) {
5168
            break;
5169
        }
5170
        p++;
5171
    }
5172
    return p - start;
5173
#endif
5174
13.3M
}
5175
5176
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    // A byte begins a code point when it is 0xxxxxxx (bit 7 clear) or
    // 11xxxxxx (bit 6 set); only 10xxxxxx continuation bytes yield 0.
    const unsigned int bit7_clear = (~ch >> 7) & 1;
    const unsigned int bit6_set = (ch >> 6) & 1;
    return (int)(bit7_clear | bit6_set);
}
5182
5183
static inline size_t
5184
vector_utf8_start_chars(size_t v)
5185
235M
{
5186
235M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5187
235M
}
5188
5189
5190
// Count the number of UTF-8 code points in a given byte sequence.
5191
static Py_ssize_t
5192
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5193
96.6k
{
5194
96.6k
    Py_ssize_t len = 0;
5195
5196
96.6k
    if (end - s >= SIZEOF_SIZE_T) {
5197
48.8k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5198
18.7k
            len += scalar_utf8_start_char(*s++);
5199
18.7k
        }
5200
5201
977k
        while (s + SIZEOF_SIZE_T <= end) {
5202
947k
            const unsigned char *e = end;
5203
947k
            if (e - s > SIZEOF_SIZE_T * 255) {
5204
919k
                e = s + SIZEOF_SIZE_T * 255;
5205
919k
            }
5206
947k
            Py_ssize_t vstart = 0;
5207
236M
            while (s + SIZEOF_SIZE_T <= e) {
5208
235M
                size_t v = *(size_t*)s;
5209
235M
                size_t vs = vector_utf8_start_chars(v);
5210
235M
                vstart += vs;
5211
235M
                s += SIZEOF_SIZE_T;
5212
235M
            }
5213
947k
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5214
947k
            vstart += vstart >> 16;
5215
947k
#if SIZEOF_SIZE_T == 8
5216
947k
            vstart += vstart >> 32;
5217
947k
#endif
5218
947k
            len += vstart & 0x7ff;
5219
947k
        }
5220
30.0k
    }
5221
428k
    while (s < end) {
5222
331k
        len += scalar_utf8_start_char(*s++);
5223
331k
    }
5224
96.6k
    return len;
5225
96.6k
}
5226
5227
static Py_ssize_t
5228
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5229
5.27M
{
5230
5.27M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5231
5.27M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5232
5.27M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5233
734k
    {
5234
        /* Fast path, see in STRINGLIB(utf8_decode) for
5235
           an explanation. */
5236
734k
        const char *p = start;
5237
734k
        Py_UCS1 *q = dest;
5238
1.64M
        while (p + SIZEOF_SIZE_T <= end) {
5239
1.04M
            size_t value = *(const size_t *) p;
5240
1.04M
            if (value & ASCII_CHAR_MASK)
5241
132k
                break;
5242
911k
            *((size_t *)q) = value;
5243
911k
            p += SIZEOF_SIZE_T;
5244
911k
            q += SIZEOF_SIZE_T;
5245
911k
        }
5246
3.28M
        while (p < end) {
5247
2.70M
            if ((unsigned char)*p & 0x80)
5248
152k
                break;
5249
2.55M
            *q++ = *p++;
5250
2.55M
        }
5251
734k
        return p - start;
5252
734k
    }
5253
4.54M
#endif
5254
4.54M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5255
4.54M
                                         (const unsigned char*)end);
5256
4.54M
    memcpy(dest, start, pos);
5257
4.54M
    return pos;
5258
5.27M
}
5259
5260
static int
5261
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5262
                         const char *starts, const char *s, const char *end,
5263
                         _Py_error_handler error_handler,
5264
                         const char *errors,
5265
                         Py_ssize_t *consumed)
5266
386k
{
5267
386k
    Py_ssize_t startinpos, endinpos;
5268
386k
    const char *errmsg = "";
5269
386k
    PyObject *error_handler_obj = NULL;
5270
386k
    PyObject *exc = NULL;
5271
5272
176M
    while (s < end) {
5273
176M
        Py_UCS4 ch;
5274
176M
        int kind = writer->kind;
5275
5276
176M
        if (kind == PyUnicode_1BYTE_KIND) {
5277
374k
            if (PyUnicode_IS_ASCII(writer->buffer))
5278
287k
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5279
86.5k
            else
5280
86.5k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5281
175M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5282
95.3M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5283
95.3M
        } else {
5284
80.3M
            assert(kind == PyUnicode_4BYTE_KIND);
5285
80.3M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5286
80.3M
        }
5287
5288
176M
        switch (ch) {
5289
309k
        case 0:
5290
309k
            if (s == end || consumed)
5291
287k
                goto End;
5292
22.4k
            errmsg = "unexpected end of data";
5293
22.4k
            startinpos = s - starts;
5294
22.4k
            endinpos = end - starts;
5295
22.4k
            break;
5296
138M
        case 1:
5297
138M
            errmsg = "invalid start byte";
5298
138M
            startinpos = s - starts;
5299
138M
            endinpos = startinpos + 1;
5300
138M
            break;
5301
35.6M
        case 2:
5302
35.6M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5303
35.6M
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5304
0
            {
5305
                /* Truncated surrogate code in range D800-DFFF */
5306
0
                goto End;
5307
0
            }
5308
35.6M
            _Py_FALLTHROUGH;
5309
36.8M
        case 3:
5310
37.0M
        case 4:
5311
37.0M
            errmsg = "invalid continuation byte";
5312
37.0M
            startinpos = s - starts;
5313
37.0M
            endinpos = startinpos + ch - 1;
5314
37.0M
            break;
5315
276k
        default:
5316
            // ch doesn't fit into kind, so change the buffer kind to write
5317
            // the character
5318
276k
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5319
0
                goto onError;
5320
276k
            continue;
5321
176M
        }
5322
5323
175M
        if (error_handler == _Py_ERROR_UNKNOWN)
5324
108k
            error_handler = _Py_GetErrorHandler(errors);
5325
5326
175M
        switch (error_handler) {
5327
0
        case _Py_ERROR_IGNORE:
5328
0
            s += (endinpos - startinpos);
5329
0
            break;
5330
5331
175M
        case _Py_ERROR_REPLACE:
5332
175M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5333
0
                goto onError;
5334
175M
            s += (endinpos - startinpos);
5335
175M
            break;
5336
5337
3.33k
        case _Py_ERROR_SURROGATEESCAPE:
5338
3.33k
        {
5339
3.33k
            Py_ssize_t i;
5340
5341
3.33k
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5342
0
                goto onError;
5343
7.08k
            for (i=startinpos; i<endinpos; i++) {
5344
3.75k
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5345
3.75k
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5346
3.75k
                                ch + 0xdc00);
5347
3.75k
                writer->pos++;
5348
3.75k
            }
5349
3.33k
            s += (endinpos - startinpos);
5350
3.33k
            break;
5351
3.33k
        }
5352
5353
3.59k
        default:
5354
3.59k
            if (unicode_decode_call_errorhandler_writer(
5355
3.59k
                    errors, &error_handler_obj,
5356
3.59k
                    "utf-8", errmsg,
5357
3.59k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5358
3.59k
                    writer)) {
5359
3.59k
                goto onError;
5360
3.59k
            }
5361
5362
0
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5363
0
                return -1;
5364
0
            }
5365
175M
        }
5366
175M
    }
5367
5368
383k
End:
5369
383k
    if (consumed)
5370
1.24k
        *consumed = s - starts;
5371
5372
383k
    Py_XDECREF(error_handler_obj);
5373
383k
    Py_XDECREF(exc);
5374
383k
    return 0;
5375
5376
3.59k
onError:
5377
3.59k
    Py_XDECREF(error_handler_obj);
5378
3.59k
    Py_XDECREF(exc);
5379
3.59k
    return -1;
5380
386k
}
5381
5382
5383
static PyObject *
5384
unicode_decode_utf8(const char *s, Py_ssize_t size,
5385
                    _Py_error_handler error_handler, const char *errors,
5386
                    Py_ssize_t *consumed)
5387
10.8M
{
5388
10.8M
    if (size == 0) {
5389
75.1k
        if (consumed) {
5390
0
            *consumed = 0;
5391
0
        }
5392
75.1k
        _Py_RETURN_UNICODE_EMPTY();
5393
75.1k
    }
5394
5395
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5396
10.8M
    if (size == 1 && (unsigned char)s[0] < 128) {
5397
1.78M
        if (consumed) {
5398
0
            *consumed = 1;
5399
0
        }
5400
1.78M
        return get_latin1_char((unsigned char)s[0]);
5401
1.78M
    }
5402
5403
    // I don't know this check is necessary or not. But there is a test
5404
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5405
9.02M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5406
0
        PyErr_NoMemory();
5407
0
        return NULL;
5408
0
    }
5409
5410
9.02M
    const char *starts = s;
5411
9.02M
    const char *end = s + size;
5412
5413
9.02M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5414
9.02M
    if (pos == size) {  // fast path: ASCII string.
5415
8.68M
        PyObject *u = PyUnicode_New(size, 127);
5416
8.68M
        if (u == NULL) {
5417
0
            return NULL;
5418
0
        }
5419
8.68M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5420
8.68M
        if (consumed) {
5421
0
            *consumed = size;
5422
0
        }
5423
8.68M
        return u;
5424
8.68M
    }
5425
5426
344k
    int maxchr = 127;
5427
344k
    Py_ssize_t maxsize = size;
5428
5429
344k
    unsigned char ch = (unsigned char)(s[pos]);
5430
    // error handler other than strict may remove/replace the invalid byte.
5431
    // consumed != NULL allows 1~3 bytes remainings.
5432
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5433
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5434
    // reallocation and copy.
5435
344k
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5436
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5437
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5438
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5439
        // means that it is no longer necessary to allocate several times the required amount
5440
        // of memory.
5441
96.6k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5442
96.6k
        if (ch < 0xc4) { // latin1
5443
14.1k
            maxchr = 0xff;
5444
14.1k
        }
5445
82.5k
        else if (ch < 0xf0) { // ucs2
5446
72.4k
            maxchr = 0xffff;
5447
72.4k
        }
5448
10.1k
        else { // ucs4
5449
10.1k
            maxchr = 0x10ffff;
5450
10.1k
        }
5451
96.6k
    }
5452
344k
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5453
344k
    if (!u) {
5454
0
        return NULL;
5455
0
    }
5456
5457
    // Use _PyUnicodeWriter after fast path is failed.
5458
344k
    _PyUnicodeWriter writer;
5459
344k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5460
344k
    if (maxchr <= 255) {
5461
261k
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5462
261k
        s += pos;
5463
261k
        size -= pos;
5464
261k
        writer.pos = pos;
5465
261k
    }
5466
5467
344k
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5468
344k
                                 error_handler, errors,
5469
344k
                                 consumed) < 0) {
5470
3.59k
        _PyUnicodeWriter_Dealloc(&writer);
5471
3.59k
        return NULL;
5472
3.59k
    }
5473
340k
    return _PyUnicodeWriter_Finish(&writer);
5474
344k
}
5475
5476
5477
// Used by PyUnicodeWriter_WriteUTF8() implementation
5478
static int
5479
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
5480
                           const char *s, Py_ssize_t size,
5481
                           _Py_error_handler error_handler, const char *errors,
5482
                           Py_ssize_t *consumed)
5483
4.55M
{
5484
4.55M
    if (size == 0) {
5485
8.57k
        if (consumed) {
5486
0
            *consumed = 0;
5487
0
        }
5488
8.57k
        return 0;
5489
8.57k
    }
5490
5491
    // fast path: try ASCII string.
5492
4.54M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5493
0
        return -1;
5494
0
    }
5495
5496
4.54M
    const char *starts = s;
5497
4.54M
    const char *end = s + size;
5498
4.54M
    Py_ssize_t decoded = 0;
5499
4.54M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5500
4.54M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5501
4.54M
        decoded = ascii_decode(s, end, dest);
5502
4.54M
        writer->pos += decoded;
5503
5504
4.54M
        if (decoded == size) {
5505
4.50M
            if (consumed) {
5506
1.14k
                *consumed = size;
5507
1.14k
            }
5508
4.50M
            return 0;
5509
4.50M
        }
5510
40.4k
        s += decoded;
5511
40.4k
        size -= decoded;
5512
40.4k
    }
5513
5514
42.6k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5515
42.6k
                                    error_handler, errors, consumed);
5516
4.54M
}
5517
5518
5519
PyObject *
5520
PyUnicode_DecodeUTF8Stateful(const char *s,
5521
                             Py_ssize_t size,
5522
                             const char *errors,
5523
                             Py_ssize_t *consumed)
5524
10.8M
{
5525
10.8M
    return unicode_decode_utf8(s, size,
5526
10.8M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5527
10.8M
                               errors, consumed);
5528
10.8M
}
5529
5530
5531
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5532
   non-zero, use strict error handler otherwise.
5533
5534
   On success, write a pointer to a newly allocated wide character string into
5535
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5536
   (in number of wchar_t units) into *wlen (if wlen is set).
5537
5538
   On memory allocation failure, return -1.
5539
5540
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5541
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5542
   is not NULL, write the decoding error message into *reason. */
5543
int
5544
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5545
                 const char **reason, _Py_error_handler errors)
5546
5.24k
{
5547
5.24k
    const char *orig_s = s;
5548
5.24k
    const char *e;
5549
5.24k
    wchar_t *unicode;
5550
5.24k
    Py_ssize_t outpos;
5551
5552
5.24k
    int surrogateescape = 0;
5553
5.24k
    int surrogatepass = 0;
5554
5.24k
    switch (errors)
5555
5.24k
    {
5556
0
    case _Py_ERROR_STRICT:
5557
0
        break;
5558
5.24k
    case _Py_ERROR_SURROGATEESCAPE:
5559
5.24k
        surrogateescape = 1;
5560
5.24k
        break;
5561
0
    case _Py_ERROR_SURROGATEPASS:
5562
0
        surrogatepass = 1;
5563
0
        break;
5564
0
    default:
5565
0
        return -3;
5566
5.24k
    }
5567
5568
    /* Note: size will always be longer than the resulting Unicode
5569
       character count */
5570
5.24k
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5571
0
        return -1;
5572
0
    }
5573
5574
5.24k
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5575
5.24k
    if (!unicode) {
5576
0
        return -1;
5577
0
    }
5578
5579
    /* Unpack UTF-8 encoded data */
5580
5.24k
    e = s + size;
5581
5.24k
    outpos = 0;
5582
5.24k
    while (s < e) {
5583
5.24k
        Py_UCS4 ch;
5584
5.24k
#if SIZEOF_WCHAR_T == 4
5585
5.24k
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5586
#else
5587
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5588
#endif
5589
5.24k
        if (ch > 0xFF) {
5590
0
#if SIZEOF_WCHAR_T == 4
5591
0
            Py_UNREACHABLE();
5592
#else
5593
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5594
            /* write a surrogate pair */
5595
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5596
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5597
#endif
5598
0
        }
5599
5.24k
        else {
5600
5.24k
            if (!ch && s == e) {
5601
5.24k
                break;
5602
5.24k
            }
5603
5604
0
            if (surrogateescape) {
5605
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5606
0
            }
5607
0
            else {
5608
                /* Is it a valid three-byte code? */
5609
0
                if (surrogatepass
5610
0
                    && (e - s) >= 3
5611
0
                    && (s[0] & 0xf0) == 0xe0
5612
0
                    && (s[1] & 0xc0) == 0x80
5613
0
                    && (s[2] & 0xc0) == 0x80)
5614
0
                {
5615
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5616
0
                    s += 3;
5617
0
                    unicode[outpos++] = ch;
5618
0
                }
5619
0
                else {
5620
0
                    PyMem_RawFree(unicode );
5621
0
                    if (reason != NULL) {
5622
0
                        switch (ch) {
5623
0
                        case 0:
5624
0
                            *reason = "unexpected end of data";
5625
0
                            break;
5626
0
                        case 1:
5627
0
                            *reason = "invalid start byte";
5628
0
                            break;
5629
                        /* 2, 3, 4 */
5630
0
                        default:
5631
0
                            *reason = "invalid continuation byte";
5632
0
                            break;
5633
0
                        }
5634
0
                    }
5635
0
                    if (wlen != NULL) {
5636
0
                        *wlen = s - orig_s;
5637
0
                    }
5638
0
                    return -2;
5639
0
                }
5640
0
            }
5641
0
        }
5642
5.24k
    }
5643
5.24k
    unicode[outpos] = L'\0';
5644
5.24k
    if (wlen) {
5645
5.24k
        *wlen = outpos;
5646
5.24k
    }
5647
5.24k
    *wstr = unicode;
5648
5.24k
    return 0;
5649
5.24k
}
5650
5651
5652
wchar_t*
5653
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5654
                               size_t *wlen)
5655
0
{
5656
0
    wchar_t *wstr;
5657
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5658
0
                               &wstr, wlen,
5659
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5660
0
    if (res != 0) {
5661
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5662
0
        assert(res != -3);
5663
0
        if (wlen) {
5664
0
            *wlen = (size_t)res;
5665
0
        }
5666
0
        return NULL;
5667
0
    }
5668
0
    return wstr;
5669
0
}
5670
5671
5672
/* UTF-8 encoder.
5673
5674
   On success, return 0 and write the newly allocated character string (use
5675
   PyMem_Free() to free the memory) into *str.
5676
5677
   On encoding failure, return -2 and write the position of the invalid
5678
   surrogate character into *error_pos (if error_pos is set) and the decoding
5679
   error message into *reason (if reason is set).
5680
5681
   On memory allocation failure, return -1. */
5682
int
5683
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5684
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5685
650
{
5686
650
    const Py_ssize_t max_char_size = 4;
5687
650
    Py_ssize_t len = wcslen(text);
5688
5689
650
    assert(len >= 0);
5690
5691
650
    int surrogateescape = 0;
5692
650
    int surrogatepass = 0;
5693
650
    switch (errors)
5694
650
    {
5695
64
    case _Py_ERROR_STRICT:
5696
64
        break;
5697
586
    case _Py_ERROR_SURROGATEESCAPE:
5698
586
        surrogateescape = 1;
5699
586
        break;
5700
0
    case _Py_ERROR_SURROGATEPASS:
5701
0
        surrogatepass = 1;
5702
0
        break;
5703
0
    default:
5704
0
        return -3;
5705
650
    }
5706
5707
650
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5708
0
        return -1;
5709
0
    }
5710
650
    char *bytes;
5711
650
    if (raw_malloc) {
5712
650
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5713
650
    }
5714
0
    else {
5715
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5716
0
    }
5717
650
    if (bytes == NULL) {
5718
0
        return -1;
5719
0
    }
5720
5721
650
    char *p = bytes;
5722
650
    Py_ssize_t i;
5723
43.6k
    for (i = 0; i < len; ) {
5724
42.9k
        Py_ssize_t ch_pos = i;
5725
42.9k
        Py_UCS4 ch = text[i];
5726
42.9k
        i++;
5727
#if Py_UNICODE_SIZE == 2
5728
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5729
            && i < len
5730
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5731
        {
5732
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5733
            i++;
5734
        }
5735
#endif
5736
5737
42.9k
        if (ch < 0x80) {
5738
            /* Encode ASCII */
5739
42.9k
            *p++ = (char) ch;
5740
5741
42.9k
        }
5742
0
        else if (ch < 0x0800) {
5743
            /* Encode Latin-1 */
5744
0
            *p++ = (char)(0xc0 | (ch >> 6));
5745
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5746
0
        }
5747
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5748
            /* surrogateescape error handler */
5749
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5750
0
                if (error_pos != NULL) {
5751
0
                    *error_pos = (size_t)ch_pos;
5752
0
                }
5753
0
                if (reason != NULL) {
5754
0
                    *reason = "encoding error";
5755
0
                }
5756
0
                if (raw_malloc) {
5757
0
                    PyMem_RawFree(bytes);
5758
0
                }
5759
0
                else {
5760
0
                    PyMem_Free(bytes);
5761
0
                }
5762
0
                return -2;
5763
0
            }
5764
0
            *p++ = (char)(ch & 0xff);
5765
0
        }
5766
0
        else if (ch < 0x10000) {
5767
0
            *p++ = (char)(0xe0 | (ch >> 12));
5768
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5769
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5770
0
        }
5771
0
        else {  /* ch >= 0x10000 */
5772
0
            assert(ch <= MAX_UNICODE);
5773
            /* Encode UCS4 Unicode ordinals */
5774
0
            *p++ = (char)(0xf0 | (ch >> 18));
5775
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5776
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5777
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5778
0
        }
5779
42.9k
    }
5780
650
    *p++ = '\0';
5781
5782
650
    size_t final_size = (p - bytes);
5783
650
    char *bytes2;
5784
650
    if (raw_malloc) {
5785
650
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5786
650
    }
5787
0
    else {
5788
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5789
0
    }
5790
650
    if (bytes2 == NULL) {
5791
0
        if (error_pos != NULL) {
5792
0
            *error_pos = (size_t)-1;
5793
0
        }
5794
0
        if (raw_malloc) {
5795
0
            PyMem_RawFree(bytes);
5796
0
        }
5797
0
        else {
5798
0
            PyMem_Free(bytes);
5799
0
        }
5800
0
        return -1;
5801
0
    }
5802
650
    *str = bytes2;
5803
650
    return 0;
5804
650
}
5805
5806
5807
/* Primary internal function which creates utf8 encoded bytes objects.
5808
5809
   Allocation strategy:  if the string is short, convert into a stack buffer
5810
   and allocate exactly as much space needed at the end.  Else allocate the
5811
   maximum possible needed (4 result bytes per Unicode character), and return
5812
   the excess memory at the end.
5813
*/
5814
static PyObject *
5815
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5816
                    const char *errors)
5817
14.1M
{
5818
14.1M
    if (!PyUnicode_Check(unicode)) {
5819
0
        PyErr_BadArgument();
5820
0
        return NULL;
5821
0
    }
5822
5823
14.1M
    if (PyUnicode_UTF8(unicode))
5824
8.43M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5825
8.43M
                                         PyUnicode_UTF8_LENGTH(unicode));
5826
5827
5.71M
    int kind = PyUnicode_KIND(unicode);
5828
5.71M
    const void *data = PyUnicode_DATA(unicode);
5829
5.71M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5830
5831
5.71M
    _PyBytesWriter writer;
5832
5.71M
    char *end;
5833
5834
5.71M
    switch (kind) {
5835
0
    default:
5836
0
        Py_UNREACHABLE();
5837
4.30M
    case PyUnicode_1BYTE_KIND:
5838
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5839
4.30M
        assert(!PyUnicode_IS_ASCII(unicode));
5840
4.30M
        end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5841
4.30M
        break;
5842
1.34M
    case PyUnicode_2BYTE_KIND:
5843
1.34M
        end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5844
1.34M
        break;
5845
61.6k
    case PyUnicode_4BYTE_KIND:
5846
61.6k
        end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5847
61.6k
        break;
5848
5.71M
    }
5849
5850
5.71M
    if (end == NULL) {
5851
159k
        _PyBytesWriter_Dealloc(&writer);
5852
159k
        return NULL;
5853
159k
    }
5854
5.55M
    return _PyBytesWriter_Finish(&writer, end);
5855
5.71M
}
5856
5857
static int
5858
unicode_fill_utf8(PyObject *unicode)
5859
152k
{
5860
152k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5861
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5862
152k
    assert(!PyUnicode_IS_ASCII(unicode));
5863
5864
152k
    int kind = PyUnicode_KIND(unicode);
5865
152k
    const void *data = PyUnicode_DATA(unicode);
5866
152k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5867
5868
152k
    _PyBytesWriter writer;
5869
152k
    char *end;
5870
5871
152k
    switch (kind) {
5872
0
    default:
5873
0
        Py_UNREACHABLE();
5874
124k
    case PyUnicode_1BYTE_KIND:
5875
124k
        end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5876
124k
                                   _Py_ERROR_STRICT, NULL);
5877
124k
        break;
5878
23.2k
    case PyUnicode_2BYTE_KIND:
5879
23.2k
        end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5880
23.2k
                                   _Py_ERROR_STRICT, NULL);
5881
23.2k
        break;
5882
4.68k
    case PyUnicode_4BYTE_KIND:
5883
4.68k
        end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5884
4.68k
                                   _Py_ERROR_STRICT, NULL);
5885
4.68k
        break;
5886
152k
    }
5887
152k
    if (end == NULL) {
5888
206
        _PyBytesWriter_Dealloc(&writer);
5889
206
        return -1;
5890
206
    }
5891
5892
152k
    const char *start = writer.use_small_buffer ? writer.small_buffer :
5893
152k
                    PyBytes_AS_STRING(writer.buffer);
5894
152k
    Py_ssize_t len = end - start;
5895
5896
152k
    char *cache = PyMem_Malloc(len + 1);
5897
152k
    if (cache == NULL) {
5898
0
        _PyBytesWriter_Dealloc(&writer);
5899
0
        PyErr_NoMemory();
5900
0
        return -1;
5901
0
    }
5902
152k
    memcpy(cache, start, len);
5903
152k
    cache[len] = '\0';
5904
152k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5905
152k
    PyUnicode_SET_UTF8(unicode, cache);
5906
152k
    _PyBytesWriter_Dealloc(&writer);
5907
152k
    return 0;
5908
152k
}
5909
5910
/* Encode `unicode` to a UTF-8 bytes object using the error handler named by
   `errors`.  The handler is passed on as _Py_ERROR_UNKNOWN together with the
   `errors` string. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    PyObject *encoded = unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
    return encoded;
}
5915
5916
5917
/* Encode `unicode` to a UTF-8 bytes object; forwards to
   _PyUnicode_AsUTF8String() with a NULL `errors` argument. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, errors);
}
5922
5923
/* --- UTF-32 Codec ------------------------------------------------------- */
5924
5925
PyObject *
5926
PyUnicode_DecodeUTF32(const char *s,
5927
                      Py_ssize_t size,
5928
                      const char *errors,
5929
                      int *byteorder)
5930
121
{
5931
121
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5932
121
}
5933
5934
PyObject *
5935
PyUnicode_DecodeUTF32Stateful(const char *s,
5936
                              Py_ssize_t size,
5937
                              const char *errors,
5938
                              int *byteorder,
5939
                              Py_ssize_t *consumed)
5940
17.6k
{
5941
17.6k
    const char *starts = s;
5942
17.6k
    Py_ssize_t startinpos;
5943
17.6k
    Py_ssize_t endinpos;
5944
17.6k
    _PyUnicodeWriter writer;
5945
17.6k
    const unsigned char *q, *e;
5946
17.6k
    int le, bo = 0;       /* assume native ordering by default */
5947
17.6k
    const char *encoding;
5948
17.6k
    const char *errmsg = "";
5949
17.6k
    PyObject *errorHandler = NULL;
5950
17.6k
    PyObject *exc = NULL;
5951
5952
17.6k
    q = (const unsigned char *)s;
5953
17.6k
    e = q + size;
5954
5955
17.6k
    if (byteorder)
5956
17.5k
        bo = *byteorder;
5957
5958
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5959
       byte order setting accordingly. In native mode, the leading BOM
5960
       mark is skipped, in all other modes, it is copied to the output
5961
       stream as-is (giving a ZWNBSP character). */
5962
17.6k
    if (bo == 0 && size >= 4) {
5963
15.9k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5964
15.9k
        if (bom == 0x0000FEFF) {
5965
274
            bo = -1;
5966
274
            q += 4;
5967
274
        }
5968
15.6k
        else if (bom == 0xFFFE0000) {
5969
202
            bo = 1;
5970
202
            q += 4;
5971
202
        }
5972
15.9k
        if (byteorder)
5973
15.7k
            *byteorder = bo;
5974
15.9k
    }
5975
5976
17.6k
    if (q == e) {
5977
79
        if (consumed)
5978
0
            *consumed = size;
5979
79
        _Py_RETURN_UNICODE_EMPTY();
5980
79
    }
5981
5982
#ifdef WORDS_BIGENDIAN
5983
    le = bo < 0;
5984
#else
5985
17.5k
    le = bo <= 0;
5986
17.5k
#endif
5987
17.5k
    encoding = le ? "utf-32-le" : "utf-32-be";
5988
5989
17.5k
    _PyUnicodeWriter_Init(&writer);
5990
17.5k
    writer.min_length = (e - q + 3) / 4;
5991
17.5k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5992
0
        goto onError;
5993
5994
101k
    while (1) {
5995
101k
        Py_UCS4 ch = 0;
5996
101k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5997
5998
101k
        if (e - q >= 4) {
5999
89.2k
            int kind = writer.kind;
6000
89.2k
            void *data = writer.data;
6001
89.2k
            const unsigned char *last = e - 4;
6002
89.2k
            Py_ssize_t pos = writer.pos;
6003
89.2k
            if (le) {
6004
108k
                do {
6005
108k
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
6006
108k
                    if (ch > maxch)
6007
84.5k
                        break;
6008
23.5k
                    if (kind != PyUnicode_1BYTE_KIND &&
6009
23.5k
                        Py_UNICODE_IS_SURROGATE(ch))
6010
233
                        break;
6011
23.2k
                    PyUnicode_WRITE(kind, data, pos++, ch);
6012
23.2k
                    q += 4;
6013
23.2k
                } while (q <= last);
6014
85.8k
            }
6015
3.46k
            else {
6016
37.2k
                do {
6017
37.2k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
6018
37.2k
                    if (ch > maxch)
6019
3.24k
                        break;
6020
33.9k
                    if (kind != PyUnicode_1BYTE_KIND &&
6021
33.9k
                        Py_UNICODE_IS_SURROGATE(ch))
6022
100
                        break;
6023
33.8k
                    PyUnicode_WRITE(kind, data, pos++, ch);
6024
33.8k
                    q += 4;
6025
33.8k
                } while (q <= last);
6026
3.46k
            }
6027
0
            writer.pos = pos;
6028
89.2k
        }
6029
6030
101k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
6031
336
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
6032
336
            startinpos = ((const char *)q) - starts;
6033
336
            endinpos = startinpos + 4;
6034
336
        }
6035
100k
        else if (ch <= maxch) {
6036
12.9k
            if (q == e || consumed)
6037
2.98k
                break;
6038
            /* remaining bytes at the end? (size should be divisible by 4) */
6039
9.97k
            errmsg = "truncated data";
6040
9.97k
            startinpos = ((const char *)q) - starts;
6041
9.97k
            endinpos = ((const char *)e) - starts;
6042
9.97k
        }
6043
87.7k
        else {
6044
87.7k
            if (ch < 0x110000) {
6045
3.59k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6046
0
                    goto onError;
6047
3.59k
                q += 4;
6048
3.59k
                continue;
6049
3.59k
            }
6050
84.1k
            errmsg = "code point not in range(0x110000)";
6051
84.1k
            startinpos = ((const char *)q) - starts;
6052
84.1k
            endinpos = startinpos + 4;
6053
84.1k
        }
6054
6055
        /* The remaining input chars are ignored if the callback
6056
           chooses to skip the input */
6057
94.4k
        if (unicode_decode_call_errorhandler_writer(
6058
94.4k
                errors, &errorHandler,
6059
94.4k
                encoding, errmsg,
6060
94.4k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
6061
94.4k
                &writer))
6062
14.5k
            goto onError;
6063
94.4k
    }
6064
6065
2.98k
    if (consumed)
6066
0
        *consumed = (const char *)q-starts;
6067
6068
2.98k
    Py_XDECREF(errorHandler);
6069
2.98k
    Py_XDECREF(exc);
6070
2.98k
    return _PyUnicodeWriter_Finish(&writer);
6071
6072
14.5k
  onError:
6073
14.5k
    _PyUnicodeWriter_Dealloc(&writer);
6074
14.5k
    Py_XDECREF(errorHandler);
6075
14.5k
    Py_XDECREF(exc);
6076
14.5k
    return NULL;
6077
17.5k
}
6078
6079
/* Encode the str object `str` as UTF-32 and return a new bytes object.
 *
 *   errors    - error handler name used for characters the stringlib
 *               encoders refuse to encode (reported as "surrogates not
 *               allowed"); may be NULL.
 *   byteorder - negative: little-endian, no BOM; positive: big-endian,
 *               no BOM; 0: native byte order preceded by a BOM (U+FEFF).
 *
 * Returns a new reference, or NULL with an exception set.
 */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    uint32_t *out;
#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* One 4-byte unit per character, plus one for the BOM when
       byteorder == 0; refuse sizes whose byte count would overflow. */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) {
        return PyErr_NoMemory();
    }
    nsize = len + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
    if (v == NULL) {
        return NULL;
    }

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
    out = (uint32_t *)PyBytes_AS_STRING(v);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        goto done;
    }

    /* Codec name reported to error handlers and in exceptions.  It is
       selected with the same sign test that picks the emitted byte order
       (and that _PyUnicode_EncodeUTF16 uses), so that e.g. byteorder == -2,
       which produces BOM-less little-endian output, is labelled
       "utf-32-le" rather than "utf-32". */
    if (byteorder < 0) {
        encoding = "utf-32-le";
    }
    else if (byteorder > 0) {
        encoding = "utf-32-be";
    }
    else {
        encoding = "utf-32";
    }

    if (kind == PyUnicode_1BYTE_KIND) {
        /* This path never invokes the error handler: the result is
           returned as soon as the helper has run. */
        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* Encode as far as the helper gets; `pos` advances by its return
           value.  Stopping short of `len` means the character at `pos`
           must go through the error handler. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep) {
            goto error;
        }

        if (PyBytes_Check(rep)) {
            /* A bytes replacement is copied verbatim, so it must consist
               of whole 4-byte units. */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            /* A str replacement is accepted only if it is pure ASCII. */
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* The characters the handler skipped (pos..newpos) already have
           output units reserved for them; only the excess is needed. */
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0) {
                goto error;
            }
            /* The buffer may have moved; re-derive the write pointer. */
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        }
        else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    if (nsize != PyBytes_GET_SIZE(v)) {
        /* The result is deliberately not checked: per the documented
           _PyBytes_Resize() contract a failure sets v to NULL with
           MemoryError raised, which is then returned below. */
        _PyBytes_Resize(&v, nsize);
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
6223
6224
/* Encode `unicode` as UTF-32 with no explicit error handler and with
   byteorder 0, i.e. the byte order for which _PyUnicode_EncodeUTF32()
   writes a leading BOM.  Returns whatever that function returns. */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    const char *no_errors = NULL;
    const int bom_order = 0;

    return _PyUnicode_EncodeUTF32(unicode, no_errors, bom_order);
}
6229
6230
/* --- UTF-16 Codec ------------------------------------------------------- */
6231
6232
PyObject *
6233
PyUnicode_DecodeUTF16(const char *s,
6234
                      Py_ssize_t size,
6235
                      const char *errors,
6236
                      int *byteorder)
6237
150
{
6238
150
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6239
150
}
6240
6241
PyObject *
6242
PyUnicode_DecodeUTF16Stateful(const char *s,
6243
                              Py_ssize_t size,
6244
                              const char *errors,
6245
                              int *byteorder,
6246
                              Py_ssize_t *consumed)
6247
13.4k
{
6248
13.4k
    const char *starts = s;
6249
13.4k
    Py_ssize_t startinpos;
6250
13.4k
    Py_ssize_t endinpos;
6251
13.4k
    _PyUnicodeWriter writer;
6252
13.4k
    const unsigned char *q, *e;
6253
13.4k
    int bo = 0;       /* assume native ordering by default */
6254
13.4k
    int native_ordering;
6255
13.4k
    const char *errmsg = "";
6256
13.4k
    PyObject *errorHandler = NULL;
6257
13.4k
    PyObject *exc = NULL;
6258
13.4k
    const char *encoding;
6259
6260
13.4k
    q = (const unsigned char *)s;
6261
13.4k
    e = q + size;
6262
6263
13.4k
    if (byteorder)
6264
13.3k
        bo = *byteorder;
6265
6266
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6267
       byte order setting accordingly. In native mode, the leading BOM
6268
       mark is skipped, in all other modes, it is copied to the output
6269
       stream as-is (giving a ZWNBSP character). */
6270
13.4k
    if (bo == 0 && size >= 2) {
6271
12.7k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6272
12.7k
        if (bom == 0xFEFF) {
6273
366
            q += 2;
6274
366
            bo = -1;
6275
366
        }
6276
12.3k
        else if (bom == 0xFFFE) {
6277
2.19k
            q += 2;
6278
2.19k
            bo = 1;
6279
2.19k
        }
6280
12.7k
        if (byteorder)
6281
12.5k
            *byteorder = bo;
6282
12.7k
    }
6283
6284
13.4k
    if (q == e) {
6285
69
        if (consumed)
6286
0
            *consumed = size;
6287
69
        _Py_RETURN_UNICODE_EMPTY();
6288
69
    }
6289
6290
13.3k
#if PY_LITTLE_ENDIAN
6291
13.3k
    native_ordering = bo <= 0;
6292
13.3k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6293
#else
6294
    native_ordering = bo >= 0;
6295
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6296
#endif
6297
6298
    /* Note: size will always be longer than the resulting Unicode
6299
       character count normally.  Error handler will take care of
6300
       resizing when needed. */
6301
13.3k
    _PyUnicodeWriter_Init(&writer);
6302
13.3k
    writer.min_length = (e - q + 1) / 2;
6303
13.3k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6304
0
        goto onError;
6305
6306
50.8k
    while (1) {
6307
50.8k
        Py_UCS4 ch = 0;
6308
50.8k
        if (e - q >= 2) {
6309
43.6k
            int kind = writer.kind;
6310
43.6k
            if (kind == PyUnicode_1BYTE_KIND) {
6311
15.9k
                if (PyUnicode_IS_ASCII(writer.buffer))
6312
12.8k
                    ch = asciilib_utf16_decode(&q, e,
6313
12.8k
                            (Py_UCS1*)writer.data, &writer.pos,
6314
12.8k
                            native_ordering);
6315
3.18k
                else
6316
3.18k
                    ch = ucs1lib_utf16_decode(&q, e,
6317
3.18k
                            (Py_UCS1*)writer.data, &writer.pos,
6318
3.18k
                            native_ordering);
6319
27.6k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6320
11.4k
                ch = ucs2lib_utf16_decode(&q, e,
6321
11.4k
                        (Py_UCS2*)writer.data, &writer.pos,
6322
11.4k
                        native_ordering);
6323
16.1k
            } else {
6324
16.1k
                assert(kind == PyUnicode_4BYTE_KIND);
6325
16.1k
                ch = ucs4lib_utf16_decode(&q, e,
6326
16.1k
                        (Py_UCS4*)writer.data, &writer.pos,
6327
16.1k
                        native_ordering);
6328
16.1k
            }
6329
43.6k
        }
6330
6331
50.8k
        switch (ch)
6332
50.8k
        {
6333
13.0k
        case 0:
6334
            /* remaining byte at the end? (size should be even) */
6335
13.0k
            if (q == e || consumed)
6336
8.54k
                goto End;
6337
4.51k
            errmsg = "truncated data";
6338
4.51k
            startinpos = ((const char *)q) - starts;
6339
4.51k
            endinpos = ((const char *)e) - starts;
6340
4.51k
            break;
6341
            /* The remaining input chars are ignored if the callback
6342
               chooses to skip the input */
6343
1.79k
        case 1:
6344
1.79k
            q -= 2;
6345
1.79k
            if (consumed)
6346
0
                goto End;
6347
1.79k
            errmsg = "unexpected end of data";
6348
1.79k
            startinpos = ((const char *)q) - starts;
6349
1.79k
            endinpos = ((const char *)e) - starts;
6350
1.79k
            break;
6351
14.1k
        case 2:
6352
14.1k
            errmsg = "illegal encoding";
6353
14.1k
            startinpos = ((const char *)q) - 2 - starts;
6354
14.1k
            endinpos = startinpos + 2;
6355
14.1k
            break;
6356
6.56k
        case 3:
6357
6.56k
            errmsg = "illegal UTF-16 surrogate";
6358
6.56k
            startinpos = ((const char *)q) - 4 - starts;
6359
6.56k
            endinpos = startinpos + 2;
6360
6.56k
            break;
6361
15.2k
        default:
6362
15.2k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6363
0
                goto onError;
6364
15.2k
            continue;
6365
50.8k
        }
6366
6367
27.0k
        if (unicode_decode_call_errorhandler_writer(
6368
27.0k
                errors,
6369
27.0k
                &errorHandler,
6370
27.0k
                encoding, errmsg,
6371
27.0k
                &starts,
6372
27.0k
                (const char **)&e,
6373
27.0k
                &startinpos,
6374
27.0k
                &endinpos,
6375
27.0k
                &exc,
6376
27.0k
                (const char **)&q,
6377
27.0k
                &writer))
6378
4.84k
            goto onError;
6379
27.0k
    }
6380
6381
8.54k
End:
6382
8.54k
    if (consumed)
6383
0
        *consumed = (const char *)q-starts;
6384
6385
8.54k
    Py_XDECREF(errorHandler);
6386
8.54k
    Py_XDECREF(exc);
6387
8.54k
    return _PyUnicodeWriter_Finish(&writer);
6388
6389
4.84k
  onError:
6390
4.84k
    _PyUnicodeWriter_Dealloc(&writer);
6391
4.84k
    Py_XDECREF(errorHandler);
6392
4.84k
    Py_XDECREF(exc);
6393
4.84k
    return NULL;
6394
13.3k
}
6395
6396
/* Encode the str object `str` as UTF-16 and return a new bytes object.
 *
 *   errors    - error handler name used for characters the stringlib
 *               encoders refuse to encode (reported as "surrogates not
 *               allowed"); may be NULL.
 *   byteorder - negative: little-endian, no BOM; positive: big-endian,
 *               no BOM; 0: native byte order preceded by a BOM (U+FEFF).
 *
 * Returns a new reference, or NULL with an exception set.
 */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    unsigned short *out;
    Py_ssize_t pairs = 0;
#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Each character at or above U+10000 needs a second 16-bit unit. */
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *chars = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (chars[i] >= 0x10000) {
                pairs++;
            }
        }
    }

    /* Refuse sizes whose byte count would overflow; the BOM takes one
       extra unit when byteorder == 0. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
        return PyErr_NoMemory();
    }
    nsize = len + pairs + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
    if (v == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
    out = (unsigned short *)PyBytes_AS_STRING(v);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        goto done;
    }

    if (kind == PyUnicode_1BYTE_KIND) {
        /* This path never invokes the error handler: the result is
           returned as soon as the helper has run. */
        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    /* Codec name reported to error handlers and in exceptions. */
    if (byteorder < 0) {
        encoding = "utf-16-le";
    }
    else if (byteorder > 0) {
        encoding = "utf-16-be";
    }
    else {
        encoding = "utf-16";
    }

    for (pos = 0; pos < len; ) {
        Py_ssize_t newpos, repsize, moreunits;

        /* Encode as far as the helper gets; `pos` advances by its return
           value.  Stopping short of `len` means the character at `pos`
           must go through the error handler. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        if (PyBytes_Check(rep)) {
            /* A bytes replacement is copied verbatim, so it must consist
               of whole 2-byte units. */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 2;
        }
        else {
            /* A str replacement is accepted only if it is pure ASCII. */
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* The characters the handler skipped (pos..newpos) already have
           output units reserved for them; only the excess is needed. */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0) {
                goto error;
            }
            /* The buffer may have moved; re-derive the write pointer. */
            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    if (nsize != PyBytes_GET_SIZE(v)) {
        /* The result of the resize is not checked here; `v` is returned
           as _PyBytes_Resize() left it. */
        _PyBytes_Resize(&v, nsize);
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
/* NOTE(review): nothing in this function defines STORECHAR; this #undef
   looks like a leftover from an older implementation -- confirm against
   the rest of the file before removing it. */
#undef STORECHAR
}
6559
6560
/* Encode `unicode` as UTF-16 with no explicit error handler and with
   byteorder 0, i.e. the byte order for which _PyUnicode_EncodeUTF16()
   writes a leading BOM.  Returns whatever that function returns. */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    const char *no_errors = NULL;
    const int bom_order = 0;

    return _PyUnicode_EncodeUTF16(unicode, no_errors, bom_order);
}
6565
6566
_PyUnicode_Name_CAPI *
6567
_PyUnicode_GetNameCAPI(void)
6568
2.47k
{
6569
2.47k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6570
2.47k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6571
6572
2.47k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6573
2.47k
    if (ucnhash_capi == NULL) {
6574
1
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6575
1
                PyUnicodeData_CAPSULE_NAME, 1);
6576
6577
        // It's fine if we overwrite the value here. It's always the same value.
6578
1
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6579
1
    }
6580
2.47k
    return ucnhash_capi;
6581
2.47k
}
6582
6583
/* --- Unicode Escape Codec ----------------------------------------------- */
6584
6585
PyObject *
6586
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6587
                               Py_ssize_t size,
6588
                               const char *errors,
6589
                               Py_ssize_t *consumed,
6590
                               int *first_invalid_escape_char,
6591
                               const char **first_invalid_escape_ptr)
6592
31.2k
{
6593
31.2k
    const char *starts = s;
6594
31.2k
    const char *initial_starts = starts;
6595
31.2k
    _PyUnicodeWriter writer;
6596
31.2k
    const char *end;
6597
31.2k
    PyObject *errorHandler = NULL;
6598
31.2k
    PyObject *exc = NULL;
6599
31.2k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6600
6601
    // so we can remember if we've seen an invalid escape char or not
6602
31.2k
    *first_invalid_escape_char = -1;
6603
31.2k
    *first_invalid_escape_ptr = NULL;
6604
6605
31.2k
    if (size == 0) {
6606
1.89k
        if (consumed) {
6607
0
            *consumed = 0;
6608
0
        }
6609
1.89k
        _Py_RETURN_UNICODE_EMPTY();
6610
1.89k
    }
6611
    /* Escaped strings will always be longer than the resulting
6612
       Unicode string, so we start with size here and then reduce the
6613
       length after conversion to the true value.
6614
       (but if the error callback returns a long replacement string
6615
       we'll have to allocate more space) */
6616
29.3k
    _PyUnicodeWriter_Init(&writer);
6617
29.3k
    writer.min_length = size;
6618
29.3k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6619
0
        goto onError;
6620
0
    }
6621
6622
29.3k
    end = s + size;
6623
179k
    while (s < end) {
6624
150k
        unsigned char c = (unsigned char) *s++;
6625
150k
        Py_UCS4 ch;
6626
150k
        int count;
6627
150k
        const char *message;
6628
6629
150k
#define WRITE_ASCII_CHAR(ch)                                                  \
6630
150k
            do {                                                              \
6631
14.9k
                assert(ch <= 127);                                            \
6632
14.9k
                assert(writer.pos < writer.size);                             \
6633
14.9k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6634
14.9k
            } while(0)
6635
6636
150k
#define WRITE_CHAR(ch)                                                        \
6637
150k
            do {                                                              \
6638
138k
                if (ch <= writer.maxchar) {                                   \
6639
122k
                    assert(writer.pos < writer.size);                         \
6640
122k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6641
122k
                }                                                             \
6642
138k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6643
0
                    goto onError;                                             \
6644
0
                }                                                             \
6645
138k
            } while(0)
6646
6647
        /* Non-escape characters are interpreted as Unicode ordinals */
6648
150k
        if (c != '\\') {
6649
91.0k
            WRITE_CHAR(c);
6650
91.0k
            continue;
6651
91.0k
        }
6652
6653
59.0k
        Py_ssize_t startinpos = s - starts - 1;
6654
        /* \ - Escapes */
6655
59.0k
        if (s >= end) {
6656
0
            message = "\\ at end of string";
6657
0
            goto incomplete;
6658
0
        }
6659
59.0k
        c = (unsigned char) *s++;
6660
6661
59.0k
        assert(writer.pos < writer.size);
6662
59.0k
        switch (c) {
6663
6664
            /* \x escapes */
6665
1.08k
        case '\n': continue;
6666
1.65k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6667
894
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6668
1.28k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6669
627
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6670
        /* FF */
6671
1.04k
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6672
801
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6673
1.10k
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6674
1.38k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6675
        /* VT */
6676
734
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6677
        /* BEL, not classic C */
6678
684
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6679
6680
            /* \OOO (octal) escapes */
6681
3.58k
        case '0': case '1': case '2': case '3':
6682
7.09k
        case '4': case '5': case '6': case '7':
6683
7.09k
            ch = c - '0';
6684
7.09k
            if (s < end && '0' <= *s && *s <= '7') {
6685
3.07k
                ch = (ch<<3) + *s++ - '0';
6686
3.07k
                if (s < end && '0' <= *s && *s <= '7') {
6687
1.66k
                    ch = (ch<<3) + *s++ - '0';
6688
1.66k
                }
6689
3.07k
            }
6690
7.09k
            if (ch > 0377) {
6691
1.37k
                if (*first_invalid_escape_char == -1) {
6692
1.03k
                    *first_invalid_escape_char = ch;
6693
1.03k
                    if (starts == initial_starts) {
6694
                        /* Back up 3 chars, since we've already incremented s. */
6695
1.03k
                        *first_invalid_escape_ptr = s - 3;
6696
1.03k
                    }
6697
1.03k
                }
6698
1.37k
            }
6699
7.09k
            WRITE_CHAR(ch);
6700
7.09k
            continue;
6701
6702
            /* hex escapes */
6703
            /* \xXX */
6704
7.09k
        case 'x':
6705
6.15k
            count = 2;
6706
6.15k
            message = "truncated \\xXX escape";
6707
6.15k
            goto hexescape;
6708
6709
            /* \uXXXX */
6710
9.10k
        case 'u':
6711
9.10k
            count = 4;
6712
9.10k
            message = "truncated \\uXXXX escape";
6713
9.10k
            goto hexescape;
6714
6715
            /* \UXXXXXXXX */
6716
18.1k
        case 'U':
6717
18.1k
            count = 8;
6718
18.1k
            message = "truncated \\UXXXXXXXX escape";
6719
33.4k
        hexescape:
6720
227k
            for (ch = 0; count; ++s, --count) {
6721
194k
                if (s >= end) {
6722
6
                    goto incomplete;
6723
6
                }
6724
194k
                c = (unsigned char)*s;
6725
194k
                ch <<= 4;
6726
194k
                if (c >= '0' && c <= '9') {
6727
143k
                    ch += c - '0';
6728
143k
                }
6729
50.5k
                else if (c >= 'a' && c <= 'f') {
6730
50.2k
                    ch += c - ('a' - 10);
6731
50.2k
                }
6732
245
                else if (c >= 'A' && c <= 'F') {
6733
241
                    ch += c - ('A' - 10);
6734
241
                }
6735
4
                else {
6736
4
                    goto error;
6737
4
                }
6738
194k
            }
6739
6740
            /* when we get here, ch is a 32-bit unicode character */
6741
33.4k
            if (ch > MAX_UNICODE) {
6742
1
                message = "illegal Unicode character";
6743
1
                goto error;
6744
1
            }
6745
6746
33.4k
            WRITE_CHAR(ch);
6747
33.4k
            continue;
6748
6749
            /* \N{name} */
6750
33.4k
        case 'N':
6751
2.47k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6752
2.47k
            if (ucnhash_capi == NULL) {
6753
0
                PyErr_SetString(
6754
0
                        PyExc_UnicodeError,
6755
0
                        "\\N escapes not supported (can't load unicodedata module)"
6756
0
                );
6757
0
                goto onError;
6758
0
            }
6759
6760
2.47k
            message = "malformed \\N character escape";
6761
2.47k
            if (s >= end) {
6762
4
                goto incomplete;
6763
4
            }
6764
2.46k
            if (*s == '{') {
6765
2.46k
                const char *start = ++s;
6766
2.46k
                size_t namelen;
6767
                /* look for the closing brace */
6768
38.6k
                while (s < end && *s != '}')
6769
36.1k
                    s++;
6770
2.46k
                if (s >= end) {
6771
11
                    goto incomplete;
6772
11
                }
6773
2.45k
                namelen = s - start;
6774
2.45k
                if (namelen) {
6775
                    /* found a name.  look it up in the unicode database */
6776
2.45k
                    s++;
6777
2.45k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6778
2.45k
                    if (namelen <= INT_MAX &&
6779
2.45k
                        ucnhash_capi->getcode(start, (int)namelen,
6780
2.45k
                                              &ch, 0)) {
6781
2.38k
                        assert(ch <= MAX_UNICODE);
6782
2.38k
                        WRITE_CHAR(ch);
6783
2.38k
                        continue;
6784
2.38k
                    }
6785
64
                    message = "unknown Unicode character name";
6786
64
                }
6787
2.45k
            }
6788
69
            goto error;
6789
6790
4.70k
        default:
6791
4.70k
            if (*first_invalid_escape_char == -1) {
6792
3.55k
                *first_invalid_escape_char = c;
6793
3.55k
                if (starts == initial_starts) {
6794
                    /* Back up one char, since we've already incremented s. */
6795
3.55k
                    *first_invalid_escape_ptr = s - 1;
6796
3.55k
                }
6797
3.55k
            }
6798
4.70k
            WRITE_ASCII_CHAR('\\');
6799
4.70k
            WRITE_CHAR(c);
6800
4.70k
            continue;
6801
59.0k
        }
6802
6803
21
      incomplete:
6804
21
        if (consumed) {
6805
0
            *consumed = startinpos;
6806
0
            break;
6807
0
        }
6808
95
      error:;
6809
95
        Py_ssize_t endinpos = s-starts;
6810
95
        writer.min_length = end - s + writer.pos;
6811
95
        if (unicode_decode_call_errorhandler_writer(
6812
95
                errors, &errorHandler,
6813
95
                "unicodeescape", message,
6814
95
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6815
95
                &writer)) {
6816
95
            goto onError;
6817
95
        }
6818
0
        assert(end - s <= writer.size - writer.pos);
6819
6820
0
#undef WRITE_ASCII_CHAR
6821
0
#undef WRITE_CHAR
6822
0
    }
6823
6824
29.2k
    Py_XDECREF(errorHandler);
6825
29.2k
    Py_XDECREF(exc);
6826
29.2k
    return _PyUnicodeWriter_Finish(&writer);
6827
6828
95
  onError:
6829
95
    _PyUnicodeWriter_Dealloc(&writer);
6830
95
    Py_XDECREF(errorHandler);
6831
95
    Py_XDECREF(exc);
6832
95
    return NULL;
6833
29.3k
}
6834
6835
PyObject *
6836
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6837
                              Py_ssize_t size,
6838
                              const char *errors,
6839
                              Py_ssize_t *consumed)
6840
0
{
6841
0
    int first_invalid_escape_char;
6842
0
    const char *first_invalid_escape_ptr;
6843
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6844
0
                                                      consumed,
6845
0
                                                      &first_invalid_escape_char,
6846
0
                                                      &first_invalid_escape_ptr);
6847
0
    if (result == NULL)
6848
0
        return NULL;
6849
0
    if (first_invalid_escape_char != -1) {
6850
0
        if (first_invalid_escape_char > 0xff) {
6851
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6852
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6853
0
                                 "Such sequences will not work in the future. ",
6854
0
                                 first_invalid_escape_char) < 0)
6855
0
            {
6856
0
                Py_DECREF(result);
6857
0
                return NULL;
6858
0
            }
6859
0
        }
6860
0
        else {
6861
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6862
0
                                 "\"\\%c\" is an invalid escape sequence. "
6863
0
                                 "Such sequences will not work in the future. ",
6864
0
                                 first_invalid_escape_char) < 0)
6865
0
            {
6866
0
                Py_DECREF(result);
6867
0
                return NULL;
6868
0
            }
6869
0
        }
6870
0
    }
6871
0
    return result;
6872
0
}
6873
6874
PyObject *
6875
PyUnicode_DecodeUnicodeEscape(const char *s,
6876
                              Py_ssize_t size,
6877
                              const char *errors)
6878
0
{
6879
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6880
0
}
6881
6882
/* Return a Unicode-Escape string version of the Unicode object. */
6883
6884
PyObject *
6885
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6886
506k
{
6887
506k
    Py_ssize_t i, len;
6888
506k
    PyObject *repr;
6889
506k
    char *p;
6890
506k
    int kind;
6891
506k
    const void *data;
6892
506k
    Py_ssize_t expandsize;
6893
6894
    /* Initial allocation is based on the longest-possible character
6895
       escape.
6896
6897
       For UCS1 strings it's '\xxx', 4 bytes per source character.
6898
       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6899
       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6900
    */
6901
6902
506k
    if (!PyUnicode_Check(unicode)) {
6903
0
        PyErr_BadArgument();
6904
0
        return NULL;
6905
0
    }
6906
6907
506k
    len = PyUnicode_GET_LENGTH(unicode);
6908
506k
    if (len == 0) {
6909
0
        return PyBytes_FromStringAndSize(NULL, 0);
6910
0
    }
6911
6912
506k
    kind = PyUnicode_KIND(unicode);
6913
506k
    data = PyUnicode_DATA(unicode);
6914
    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6915
       bytes, and 1 byte characters 4. */
6916
506k
    expandsize = kind * 2 + 2;
6917
506k
    if (len > PY_SSIZE_T_MAX / expandsize) {
6918
0
        return PyErr_NoMemory();
6919
0
    }
6920
506k
    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6921
506k
    if (repr == NULL) {
6922
0
        return NULL;
6923
0
    }
6924
6925
506k
    p = PyBytes_AS_STRING(repr);
6926
1.01M
    for (i = 0; i < len; i++) {
6927
506k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6928
6929
        /* U+0000-U+00ff range */
6930
506k
        if (ch < 0x100) {
6931
499k
            if (ch >= ' ' && ch < 127) {
6932
35.7k
                if (ch != '\\') {
6933
                    /* Copy printable US ASCII as-is */
6934
0
                    *p++ = (char) ch;
6935
0
                }
6936
                /* Escape backslashes */
6937
35.7k
                else {
6938
35.7k
                    *p++ = '\\';
6939
35.7k
                    *p++ = '\\';
6940
35.7k
                }
6941
35.7k
            }
6942
6943
            /* Map special whitespace to '\t', \n', '\r' */
6944
463k
            else if (ch == '\t') {
6945
3.20k
                *p++ = '\\';
6946
3.20k
                *p++ = 't';
6947
3.20k
            }
6948
460k
            else if (ch == '\n') {
6949
1.49k
                *p++ = '\\';
6950
1.49k
                *p++ = 'n';
6951
1.49k
            }
6952
458k
            else if (ch == '\r') {
6953
536
                *p++ = '\\';
6954
536
                *p++ = 'r';
6955
536
            }
6956
6957
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6958
458k
            else {
6959
458k
                *p++ = '\\';
6960
458k
                *p++ = 'x';
6961
458k
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6962
458k
                *p++ = Py_hexdigits[ch & 0x000F];
6963
458k
            }
6964
499k
        }
6965
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6966
7.08k
        else if (ch < 0x10000) {
6967
5.89k
            *p++ = '\\';
6968
5.89k
            *p++ = 'u';
6969
5.89k
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6970
5.89k
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6971
5.89k
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6972
5.89k
            *p++ = Py_hexdigits[ch & 0x000F];
6973
5.89k
        }
6974
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6975
1.18k
        else {
6976
6977
            /* Make sure that the first two digits are zero */
6978
1.18k
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6979
1.18k
            *p++ = '\\';
6980
1.18k
            *p++ = 'U';
6981
1.18k
            *p++ = '0';
6982
1.18k
            *p++ = '0';
6983
1.18k
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6984
1.18k
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6985
1.18k
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6986
1.18k
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6987
1.18k
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6988
1.18k
            *p++ = Py_hexdigits[ch & 0x0000000F];
6989
1.18k
        }
6990
506k
    }
6991
6992
506k
    assert(p - PyBytes_AS_STRING(repr) > 0);
6993
506k
    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6994
0
        return NULL;
6995
0
    }
6996
506k
    return repr;
6997
506k
}
6998
6999
/* --- Raw Unicode Escape Codec ------------------------------------------- */
7000
7001
PyObject *
7002
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
7003
                                          Py_ssize_t size,
7004
                                          const char *errors,
7005
                                          Py_ssize_t *consumed)
7006
0
{
7007
0
    const char *starts = s;
7008
0
    _PyUnicodeWriter writer;
7009
0
    const char *end;
7010
0
    PyObject *errorHandler = NULL;
7011
0
    PyObject *exc = NULL;
7012
7013
0
    if (size == 0) {
7014
0
        if (consumed) {
7015
0
            *consumed = 0;
7016
0
        }
7017
0
        _Py_RETURN_UNICODE_EMPTY();
7018
0
    }
7019
7020
    /* Escaped strings will always be longer than the resulting
7021
       Unicode string, so we start with size here and then reduce the
7022
       length after conversion to the true value. (But decoding error
7023
       handler might have to resize the string) */
7024
0
    _PyUnicodeWriter_Init(&writer);
7025
0
    writer.min_length = size;
7026
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
7027
0
        goto onError;
7028
0
    }
7029
7030
0
    end = s + size;
7031
0
    while (s < end) {
7032
0
        unsigned char c = (unsigned char) *s++;
7033
0
        Py_UCS4 ch;
7034
0
        int count;
7035
0
        const char *message;
7036
7037
0
#define WRITE_CHAR(ch)                                                        \
7038
0
            do {                                                              \
7039
0
                if (ch <= writer.maxchar) {                                   \
7040
0
                    assert(writer.pos < writer.size);                         \
7041
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
7042
0
                }                                                             \
7043
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
7044
0
                    goto onError;                                             \
7045
0
                }                                                             \
7046
0
            } while(0)
7047
7048
        /* Non-escape characters are interpreted as Unicode ordinals */
7049
0
        if (c != '\\' || (s >= end && !consumed)) {
7050
0
            WRITE_CHAR(c);
7051
0
            continue;
7052
0
        }
7053
7054
0
        Py_ssize_t startinpos = s - starts - 1;
7055
        /* \ - Escapes */
7056
0
        if (s >= end) {
7057
0
            assert(consumed);
7058
            // Set message to silent compiler warning.
7059
            // Actually it is never used.
7060
0
            message = "\\ at end of string";
7061
0
            goto incomplete;
7062
0
        }
7063
7064
0
        c = (unsigned char) *s++;
7065
0
        if (c == 'u') {
7066
0
            count = 4;
7067
0
            message = "truncated \\uXXXX escape";
7068
0
        }
7069
0
        else if (c == 'U') {
7070
0
            count = 8;
7071
0
            message = "truncated \\UXXXXXXXX escape";
7072
0
        }
7073
0
        else {
7074
0
            assert(writer.pos < writer.size);
7075
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
7076
0
            WRITE_CHAR(c);
7077
0
            continue;
7078
0
        }
7079
7080
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
7081
0
        for (ch = 0; count; ++s, --count) {
7082
0
            if (s >= end) {
7083
0
                goto incomplete;
7084
0
            }
7085
0
            c = (unsigned char)*s;
7086
0
            ch <<= 4;
7087
0
            if (c >= '0' && c <= '9') {
7088
0
                ch += c - '0';
7089
0
            }
7090
0
            else if (c >= 'a' && c <= 'f') {
7091
0
                ch += c - ('a' - 10);
7092
0
            }
7093
0
            else if (c >= 'A' && c <= 'F') {
7094
0
                ch += c - ('A' - 10);
7095
0
            }
7096
0
            else {
7097
0
                goto error;
7098
0
            }
7099
0
        }
7100
0
        if (ch > MAX_UNICODE) {
7101
0
            message = "\\Uxxxxxxxx out of range";
7102
0
            goto error;
7103
0
        }
7104
0
        WRITE_CHAR(ch);
7105
0
        continue;
7106
7107
0
      incomplete:
7108
0
        if (consumed) {
7109
0
            *consumed = startinpos;
7110
0
            break;
7111
0
        }
7112
0
      error:;
7113
0
        Py_ssize_t endinpos = s-starts;
7114
0
        writer.min_length = end - s + writer.pos;
7115
0
        if (unicode_decode_call_errorhandler_writer(
7116
0
                errors, &errorHandler,
7117
0
                "rawunicodeescape", message,
7118
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
7119
0
                &writer)) {
7120
0
            goto onError;
7121
0
        }
7122
0
        assert(end - s <= writer.size - writer.pos);
7123
7124
0
#undef WRITE_CHAR
7125
0
    }
7126
0
    Py_XDECREF(errorHandler);
7127
0
    Py_XDECREF(exc);
7128
0
    return _PyUnicodeWriter_Finish(&writer);
7129
7130
0
  onError:
7131
0
    _PyUnicodeWriter_Dealloc(&writer);
7132
0
    Py_XDECREF(errorHandler);
7133
0
    Py_XDECREF(exc);
7134
0
    return NULL;
7135
0
}
7136
7137
PyObject *
7138
PyUnicode_DecodeRawUnicodeEscape(const char *s,
7139
                                 Py_ssize_t size,
7140
                                 const char *errors)
7141
0
{
7142
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
7143
0
}
7144
7145
7146
/* Return a Raw-Unicode-Escape string version of the Unicode object.

   Strings of 1-byte kind are copied verbatim.  For wider strings,
   characters below U+0100 are copied as single bytes, U+0100..U+ffff
   become '\uHHHH' and U+010000..U+10ffff become '\U00HHHHHH'.

   Returns a new bytes object, or NULL with an exception set if `unicode`
   is not a str object or if allocating/resizing the result fails. */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);

    if (kind == PyUnicode_1BYTE_KIND) {
        /* Every character fits in one byte: nothing needs escaping. */
        return PyBytes_FromStringAndSize(data, len);
    }

    /* Worst case per source character: 6 bytes ('\uHHHH') for 2-byte
       strings and 10 bytes ('\U00HHHHHH') for 4-byte strings. */
    const Py_ssize_t expandsize = kind * 2 + 2;
    if (len > PY_SSIZE_T_MAX / expandsize) {
        return PyErr_NoMemory();
    }

    PyObject *repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
    if (repr == NULL) {
        return NULL;
    }
    if (len == 0) {
        return repr;
    }

    char *out = PyBytes_AS_STRING(repr);
    for (Py_ssize_t pos = 0; pos < len; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        int ndigits;    /* number of hex digits still to be written */

        /* U+0000-U+00ff range: Copy 8-bit characters as-is */
        if (ch < 0x100) {
            *out++ = (char) ch;
            continue;
        }

        if (ch < 0x10000) {
            /* U+0100-U+ffff range */
            *out++ = '\\';
            *out++ = 'u';
            ndigits = 4;
        }
        else {
            /* U+010000-U+10ffff range; the two leading digits are zero */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = '\\';
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            ndigits = 6;
        }

        /* Emit the hex digits, most significant nibble first. */
        for (int shift = 4 * (ndigits - 1); shift >= 0; shift -= 4) {
            *out++ = Py_hexdigits[(ch >> shift) & 0xf];
        }
    }

    assert(out > PyBytes_AS_STRING(repr));
    /* Shrink the over-allocated buffer to what was actually written. */
    if (_PyBytes_Resize(&repr, out - PyBytes_AS_STRING(repr)) < 0) {
        return NULL;
    }
    return repr;
}
7221
7222
/* --- Latin-1 Codec ------------------------------------------------------ */
7223
7224
PyObject *
7225
PyUnicode_DecodeLatin1(const char *s,
7226
                       Py_ssize_t size,
7227
                       const char *errors)
7228
2.76M
{
7229
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7230
2.76M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7231
2.76M
}
7232
7233
/* create or adjust a UnicodeEncodeError */
7234
static void
7235
make_encode_exception(PyObject **exceptionObject,
7236
                      const char *encoding,
7237
                      PyObject *unicode,
7238
                      Py_ssize_t startpos, Py_ssize_t endpos,
7239
                      const char *reason)
7240
206k
{
7241
206k
    if (*exceptionObject == NULL) {
7242
206k
        *exceptionObject = PyObject_CallFunction(
7243
206k
            PyExc_UnicodeEncodeError, "sOnns",
7244
206k
            encoding, unicode, startpos, endpos, reason);
7245
206k
    }
7246
0
    else {
7247
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7248
0
            goto onError;
7249
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7250
0
            goto onError;
7251
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7252
0
            goto onError;
7253
0
        return;
7254
0
      onError:
7255
0
        Py_CLEAR(*exceptionObject);
7256
0
    }
7257
206k
}
7258
7259
/* raises a UnicodeEncodeError */
7260
static void
7261
raise_encode_exception(PyObject **exceptionObject,
7262
                       const char *encoding,
7263
                       PyObject *unicode,
7264
                       Py_ssize_t startpos, Py_ssize_t endpos,
7265
                       const char *reason)
7266
37.4k
{
7267
37.4k
    make_encode_exception(exceptionObject,
7268
37.4k
                          encoding, unicode, startpos, endpos, reason);
7269
37.4k
    if (*exceptionObject != NULL)
7270
37.4k
        PyCodec_StrictErrors(*exceptionObject);
7271
37.4k
}
7272
7273
/* error handling callback helper:
7274
   build arguments, call the callback and check the arguments,
7275
   put the result into newpos and return the replacement string, which
7276
   has to be freed by the caller */
7277
static PyObject *
7278
unicode_encode_call_errorhandler(const char *errors,
7279
                                 PyObject **errorHandler,
7280
                                 const char *encoding, const char *reason,
7281
                                 PyObject *unicode, PyObject **exceptionObject,
7282
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7283
                                 Py_ssize_t *newpos)
7284
168k
{
7285
168k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7286
168k
    Py_ssize_t len;
7287
168k
    PyObject *restuple;
7288
168k
    PyObject *resunicode;
7289
7290
168k
    if (*errorHandler == NULL) {
7291
168k
        *errorHandler = PyCodec_LookupError(errors);
7292
168k
        if (*errorHandler == NULL)
7293
0
            return NULL;
7294
168k
    }
7295
7296
168k
    len = PyUnicode_GET_LENGTH(unicode);
7297
7298
168k
    make_encode_exception(exceptionObject,
7299
168k
                          encoding, unicode, startpos, endpos, reason);
7300
168k
    if (*exceptionObject == NULL)
7301
0
        return NULL;
7302
7303
168k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7304
168k
    if (restuple == NULL)
7305
168k
        return NULL;
7306
0
    if (!PyTuple_Check(restuple)) {
7307
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7308
0
        Py_DECREF(restuple);
7309
0
        return NULL;
7310
0
    }
7311
0
    if (!PyArg_ParseTuple(restuple, argparse,
7312
0
                          &resunicode, newpos)) {
7313
0
        Py_DECREF(restuple);
7314
0
        return NULL;
7315
0
    }
7316
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7317
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7318
0
        Py_DECREF(restuple);
7319
0
        return NULL;
7320
0
    }
7321
0
    if (*newpos<0)
7322
0
        *newpos = len + *newpos;
7323
0
    if (*newpos<0 || *newpos>len) {
7324
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7325
0
        Py_DECREF(restuple);
7326
0
        return NULL;
7327
0
    }
7328
0
    Py_INCREF(resunicode);
7329
0
    Py_DECREF(restuple);
7330
0
    return resunicode;
7331
0
}
7332
7333
static PyObject *
7334
unicode_encode_ucs1(PyObject *unicode,
7335
                    const char *errors,
7336
                    const Py_UCS4 limit)
7337
46.6k
{
7338
    /* input state */
7339
46.6k
    Py_ssize_t pos=0, size;
7340
46.6k
    int kind;
7341
46.6k
    const void *data;
7342
    /* pointer into the output */
7343
46.6k
    char *str;
7344
46.6k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7345
46.6k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7346
46.6k
    PyObject *error_handler_obj = NULL;
7347
46.6k
    PyObject *exc = NULL;
7348
46.6k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7349
46.6k
    PyObject *rep = NULL;
7350
    /* output object */
7351
46.6k
    _PyBytesWriter writer;
7352
7353
46.6k
    size = PyUnicode_GET_LENGTH(unicode);
7354
46.6k
    kind = PyUnicode_KIND(unicode);
7355
46.6k
    data = PyUnicode_DATA(unicode);
7356
    /* allocate enough for a simple encoding without
7357
       replacements, if we need more, we'll resize */
7358
46.6k
    if (size == 0)
7359
0
        return PyBytes_FromStringAndSize(NULL, 0);
7360
7361
46.6k
    _PyBytesWriter_Init(&writer);
7362
46.6k
    str = _PyBytesWriter_Alloc(&writer, size);
7363
46.6k
    if (str == NULL)
7364
0
        return NULL;
7365
7366
3.44M
    while (pos < size) {
7367
3.44M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7368
7369
        /* can we encode this? */
7370
3.44M
        if (ch < limit) {
7371
            /* no overflow check, because we know that the space is enough */
7372
3.40M
            *str++ = (char)ch;
7373
3.40M
            ++pos;
7374
3.40M
        }
7375
46.6k
        else {
7376
46.6k
            Py_ssize_t newpos, i;
7377
            /* startpos for collecting unencodable chars */
7378
46.6k
            Py_ssize_t collstart = pos;
7379
46.6k
            Py_ssize_t collend = collstart + 1;
7380
            /* find all unecodable characters */
7381
7382
381k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7383
334k
                ++collend;
7384
7385
            /* Only overallocate the buffer if it's not the last write */
7386
46.6k
            writer.overallocate = (collend < size);
7387
7388
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7389
46.6k
            if (error_handler == _Py_ERROR_UNKNOWN)
7390
46.6k
                error_handler = _Py_GetErrorHandler(errors);
7391
7392
46.6k
            switch (error_handler) {
7393
37.4k
            case _Py_ERROR_STRICT:
7394
37.4k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7395
37.4k
                goto onError;
7396
7397
0
            case _Py_ERROR_REPLACE:
7398
0
                memset(str, '?', collend - collstart);
7399
0
                str += (collend - collstart);
7400
0
                _Py_FALLTHROUGH;
7401
0
            case _Py_ERROR_IGNORE:
7402
0
                pos = collend;
7403
0
                break;
7404
7405
0
            case _Py_ERROR_BACKSLASHREPLACE:
7406
                /* subtract preallocated bytes */
7407
0
                writer.min_size -= (collend - collstart);
7408
0
                str = backslashreplace(&writer, str,
7409
0
                                       unicode, collstart, collend);
7410
0
                if (str == NULL)
7411
0
                    goto onError;
7412
0
                pos = collend;
7413
0
                break;
7414
7415
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7416
                /* subtract preallocated bytes */
7417
0
                writer.min_size -= (collend - collstart);
7418
0
                str = xmlcharrefreplace(&writer, str,
7419
0
                                        unicode, collstart, collend);
7420
0
                if (str == NULL)
7421
0
                    goto onError;
7422
0
                pos = collend;
7423
0
                break;
7424
7425
9.19k
            case _Py_ERROR_SURROGATEESCAPE:
7426
9.19k
                for (i = collstart; i < collend; ++i) {
7427
9.19k
                    ch = PyUnicode_READ(kind, data, i);
7428
9.19k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7429
                        /* Not a UTF-8b surrogate */
7430
9.19k
                        break;
7431
9.19k
                    }
7432
0
                    *str++ = (char)(ch - 0xdc00);
7433
0
                    ++pos;
7434
0
                }
7435
9.19k
                if (i >= collend)
7436
0
                    break;
7437
9.19k
                collstart = pos;
7438
9.19k
                assert(collstart != collend);
7439
9.19k
                _Py_FALLTHROUGH;
7440
7441
9.19k
            default:
7442
9.19k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7443
9.19k
                                                       encoding, reason, unicode, &exc,
7444
9.19k
                                                       collstart, collend, &newpos);
7445
9.19k
                if (rep == NULL)
7446
9.19k
                    goto onError;
7447
7448
0
                if (newpos < collstart) {
7449
0
                    writer.overallocate = 1;
7450
0
                    str = _PyBytesWriter_Prepare(&writer, str,
7451
0
                                                 collstart - newpos);
7452
0
                    if (str == NULL)
7453
0
                        goto onError;
7454
0
                }
7455
0
                else {
7456
                    /* subtract preallocated bytes */
7457
0
                    writer.min_size -= newpos - collstart;
7458
                    /* Only overallocate the buffer if it's not the last write */
7459
0
                    writer.overallocate = (newpos < size);
7460
0
                }
7461
7462
0
                if (PyBytes_Check(rep)) {
7463
                    /* Directly copy bytes result to output. */
7464
0
                    str = _PyBytesWriter_WriteBytes(&writer, str,
7465
0
                                                    PyBytes_AS_STRING(rep),
7466
0
                                                    PyBytes_GET_SIZE(rep));
7467
0
                }
7468
0
                else {
7469
0
                    assert(PyUnicode_Check(rep));
7470
7471
0
                    if (limit == 256 ?
7472
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7473
0
                        !PyUnicode_IS_ASCII(rep))
7474
0
                    {
7475
                        /* Not all characters are smaller than limit */
7476
0
                        raise_encode_exception(&exc, encoding, unicode,
7477
0
                                               collstart, collend, reason);
7478
0
                        goto onError;
7479
0
                    }
7480
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7481
0
                    str = _PyBytesWriter_WriteBytes(&writer, str,
7482
0
                                                    PyUnicode_DATA(rep),
7483
0
                                                    PyUnicode_GET_LENGTH(rep));
7484
0
                }
7485
0
                if (str == NULL)
7486
0
                    goto onError;
7487
7488
0
                pos = newpos;
7489
0
                Py_CLEAR(rep);
7490
46.6k
            }
7491
7492
            /* If overallocation was disabled, ensure that it was the last
7493
               write. Otherwise, we missed an optimization */
7494
0
            assert(writer.overallocate || pos == size);
7495
0
        }
7496
3.44M
    }
7497
7498
0
    Py_XDECREF(error_handler_obj);
7499
0
    Py_XDECREF(exc);
7500
0
    return _PyBytesWriter_Finish(&writer, str);
7501
7502
46.6k
  onError:
7503
46.6k
    Py_XDECREF(rep);
7504
46.6k
    _PyBytesWriter_Dealloc(&writer);
7505
46.6k
    Py_XDECREF(error_handler_obj);
7506
46.6k
    Py_XDECREF(exc);
7507
46.6k
    return NULL;
7508
46.6k
}
7509
7510
/* Encode `unicode` as Latin-1 using the error handler named by `errors`.

   Returns a new bytes object, or NULL with an exception set (for example
   when `unicode` is not a str object). */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Non-Latin-1 characters present: the generic encoder applies the
           error handler. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string is copied straight into a bytes
       object. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7526
7527
/* Encode `unicode` as Latin-1, passing NULL as the error handler name. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsLatin1String(unicode, default_errors);
}
7532
7533
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7534
7535
PyObject *
7536
PyUnicode_DecodeASCII(const char *s,
7537
                      Py_ssize_t size,
7538
                      const char *errors)
7539
742k
{
7540
742k
    const char *starts = s;
7541
742k
    const char *e = s + size;
7542
742k
    PyObject *error_handler_obj = NULL;
7543
742k
    PyObject *exc = NULL;
7544
742k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7545
7546
742k
    if (size == 0)
7547
0
        _Py_RETURN_UNICODE_EMPTY();
7548
7549
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7550
742k
    if (size == 1 && (unsigned char)s[0] < 128) {
7551
7.29k
        return get_latin1_char((unsigned char)s[0]);
7552
7.29k
    }
7553
7554
    // Shortcut for simple case
7555
735k
    PyObject *u = PyUnicode_New(size, 127);
7556
735k
    if (u == NULL) {
7557
0
        return NULL;
7558
0
    }
7559
735k
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7560
735k
    if (outpos == size) {
7561
582k
        return u;
7562
582k
    }
7563
7564
152k
    _PyUnicodeWriter writer;
7565
152k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7566
152k
    writer.pos = outpos;
7567
7568
152k
    s += outpos;
7569
152k
    int kind = writer.kind;
7570
152k
    void *data = writer.data;
7571
152k
    Py_ssize_t startinpos, endinpos;
7572
7573
21.4M
    while (s < e) {
7574
21.3M
        unsigned char c = (unsigned char)*s;
7575
21.3M
        if (c < 128) {
7576
6.84M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7577
6.84M
            writer.pos++;
7578
6.84M
            ++s;
7579
6.84M
            continue;
7580
6.84M
        }
7581
7582
        /* byte outsize range 0x00..0x7f: call the error handler */
7583
7584
14.4M
        if (error_handler == _Py_ERROR_UNKNOWN)
7585
152k
            error_handler = _Py_GetErrorHandler(errors);
7586
7587
14.4M
        switch (error_handler)
7588
14.4M
        {
7589
771k
        case _Py_ERROR_REPLACE:
7590
14.4M
        case _Py_ERROR_SURROGATEESCAPE:
7591
            /* Fast-path: the error handler only writes one character,
7592
               but we may switch to UCS2 at the first write */
7593
14.4M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7594
0
                goto onError;
7595
14.4M
            kind = writer.kind;
7596
14.4M
            data = writer.data;
7597
7598
14.4M
            if (error_handler == _Py_ERROR_REPLACE)
7599
771k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7600
13.6M
            else
7601
13.6M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7602
14.4M
            writer.pos++;
7603
14.4M
            ++s;
7604
14.4M
            break;
7605
7606
0
        case _Py_ERROR_IGNORE:
7607
0
            ++s;
7608
0
            break;
7609
7610
6.17k
        default:
7611
6.17k
            startinpos = s-starts;
7612
6.17k
            endinpos = startinpos + 1;
7613
6.17k
            if (unicode_decode_call_errorhandler_writer(
7614
6.17k
                    errors, &error_handler_obj,
7615
6.17k
                    "ascii", "ordinal not in range(128)",
7616
6.17k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7617
6.17k
                    &writer))
7618
6.17k
                goto onError;
7619
0
            kind = writer.kind;
7620
0
            data = writer.data;
7621
14.4M
        }
7622
14.4M
    }
7623
146k
    Py_XDECREF(error_handler_obj);
7624
146k
    Py_XDECREF(exc);
7625
146k
    return _PyUnicodeWriter_Finish(&writer);
7626
7627
6.17k
  onError:
7628
6.17k
    _PyUnicodeWriter_Dealloc(&writer);
7629
6.17k
    Py_XDECREF(error_handler_obj);
7630
6.17k
    Py_XDECREF(exc);
7631
6.17k
    return NULL;
7632
152k
}
7633
7634
/* Encode `unicode` as ASCII using the error handler named by `errors`.

   Returns a new bytes object, or NULL with an exception set (for example
   when `unicode` is not a str object). */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters present: the generic encoder applies the
           error handler. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string is copied straight into a bytes
       object. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7648
7649
/* Encode `unicode` as ASCII, passing NULL as the error handler name. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsASCIIString(unicode, default_errors);
}
7654
7655
#ifdef MS_WINDOWS
7656
7657
/* --- MBCS codecs for Windows -------------------------------------------- */
7658
7659
#if SIZEOF_INT < SIZEOF_SIZE_T
7660
#define NEED_RETRY
7661
#endif
7662
7663
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
7667
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7668
7669
#ifndef WC_ERR_INVALID_CHARS
7670
#  define WC_ERR_INVALID_CHARS 0x0080
7671
#endif
7672
7673
static const char*
7674
code_page_name(UINT code_page, PyObject **obj)
7675
{
7676
    *obj = NULL;
7677
    if (code_page == CP_ACP)
7678
        return "mbcs";
7679
7680
    *obj = PyBytes_FromFormat("cp%u", code_page);
7681
    if (*obj == NULL)
7682
        return NULL;
7683
    return PyBytes_AS_STRING(*obj);
7684
}
7685
7686
static DWORD
7687
decode_code_page_flags(UINT code_page)
7688
{
7689
    if (code_page == CP_UTF7) {
7690
        /* The CP_UTF7 decoder only supports flags=0 */
7691
        return 0;
7692
    }
7693
    else
7694
        return MB_ERR_INVALID_CHARS;
7695
}
7696
7697
/*
7698
 * Decode a byte string from a Windows code page into unicode object in strict
7699
 * mode.
7700
 *
7701
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7702
 * OSError and returns -1 on other error.
7703
 */
7704
static int
7705
decode_code_page_strict(UINT code_page,
7706
                        wchar_t **buf,
7707
                        Py_ssize_t *bufsize,
7708
                        const char *in,
7709
                        int insize)
7710
{
7711
    DWORD flags = MB_ERR_INVALID_CHARS;
7712
    wchar_t *out;
7713
    DWORD outsize;
7714
7715
    /* First get the size of the result */
7716
    assert(insize > 0);
7717
    while ((outsize = MultiByteToWideChar(code_page, flags,
7718
                                          in, insize, NULL, 0)) <= 0)
7719
    {
7720
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7721
            goto error;
7722
        }
7723
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7724
        flags = 0;
7725
    }
7726
7727
    /* Extend a wchar_t* buffer */
7728
    Py_ssize_t n = *bufsize;   /* Get the current length */
7729
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7730
        return -1;
7731
    }
7732
    out = *buf + n;
7733
7734
    /* Do the conversion */
7735
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7736
    if (outsize <= 0)
7737
        goto error;
7738
    return insize;
7739
7740
error:
7741
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7742
        return -2;
7743
    PyErr_SetFromWindowsErr(0);
7744
    return -1;
7745
}
7746
7747
/*
7748
 * Decode a byte string from a code page into unicode object with an error
7749
 * handler.
7750
 *
7751
 * Returns consumed size if succeed, or raise an OSError or
7752
 * UnicodeDecodeError exception and returns -1 on error.
7753
 */
7754
static int
7755
decode_code_page_errors(UINT code_page,
7756
                        wchar_t **buf,
7757
                        Py_ssize_t *bufsize,
7758
                        const char *in, const int size,
7759
                        const char *errors, int final)
7760
{
7761
    const char *startin = in;
7762
    const char *endin = in + size;
7763
    DWORD flags = MB_ERR_INVALID_CHARS;
7764
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7765
       2000 English version of the message. */
7766
    const char *reason = "No mapping for the Unicode character exists "
7767
                         "in the target code page.";
7768
    /* each step cannot decode more than 1 character, but a character can be
7769
       represented as a surrogate pair */
7770
    wchar_t buffer[2], *out;
7771
    int insize;
7772
    Py_ssize_t outsize;
7773
    PyObject *errorHandler = NULL;
7774
    PyObject *exc = NULL;
7775
    PyObject *encoding_obj = NULL;
7776
    const char *encoding;
7777
    DWORD err;
7778
    int ret = -1;
7779
7780
    assert(size > 0);
7781
7782
    encoding = code_page_name(code_page, &encoding_obj);
7783
    if (encoding == NULL)
7784
        return -1;
7785
7786
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7787
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7788
           UnicodeDecodeError. */
7789
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7790
        if (exc != NULL) {
7791
            PyCodec_StrictErrors(exc);
7792
            Py_CLEAR(exc);
7793
        }
7794
        goto error;
7795
    }
7796
7797
    /* Extend a wchar_t* buffer */
7798
    Py_ssize_t n = *bufsize;   /* Get the current length */
7799
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7800
        PyErr_NoMemory();
7801
        goto error;
7802
    }
7803
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7804
        goto error;
7805
    }
7806
    out = *buf + n;
7807
7808
    /* Decode the byte string character per character */
7809
    while (in < endin)
7810
    {
7811
        /* Decode a character */
7812
        insize = 1;
7813
        do
7814
        {
7815
            outsize = MultiByteToWideChar(code_page, flags,
7816
                                          in, insize,
7817
                                          buffer, Py_ARRAY_LENGTH(buffer));
7818
            if (outsize > 0)
7819
                break;
7820
            err = GetLastError();
7821
            if (err == ERROR_INVALID_FLAGS && flags) {
7822
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7823
                flags = 0;
7824
                continue;
7825
            }
7826
            if (err != ERROR_NO_UNICODE_TRANSLATION
7827
                && err != ERROR_INSUFFICIENT_BUFFER)
7828
            {
7829
                PyErr_SetFromWindowsErr(err);
7830
                goto error;
7831
            }
7832
            insize++;
7833
        }
7834
        /* 4=maximum length of a UTF-8 sequence */
7835
        while (insize <= 4 && (in + insize) <= endin);
7836
7837
        if (outsize <= 0) {
7838
            Py_ssize_t startinpos, endinpos, outpos;
7839
7840
            /* last character in partial decode? */
7841
            if (in + insize >= endin && !final)
7842
                break;
7843
7844
            startinpos = in - startin;
7845
            endinpos = startinpos + 1;
7846
            outpos = out - *buf;
7847
            if (unicode_decode_call_errorhandler_wchar(
7848
                    errors, &errorHandler,
7849
                    encoding, reason,
7850
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7851
                    buf, bufsize, &outpos))
7852
            {
7853
                goto error;
7854
            }
7855
            out = *buf + outpos;
7856
        }
7857
        else {
7858
            in += insize;
7859
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7860
            out += outsize;
7861
        }
7862
    }
7863
7864
    /* Shrink the buffer */
7865
    assert(out - *buf <= *bufsize);
7866
    *bufsize = out - *buf;
7867
    /* (in - startin) <= size and size is an int */
7868
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7869
7870
error:
7871
    Py_XDECREF(encoding_obj);
7872
    Py_XDECREF(errorHandler);
7873
    Py_XDECREF(exc);
7874
    return ret;
7875
}
7876
7877
static PyObject *
7878
decode_code_page_stateful(int code_page,
7879
                          const char *s, Py_ssize_t size,
7880
                          const char *errors, Py_ssize_t *consumed)
7881
{
7882
    wchar_t *buf = NULL;
7883
    Py_ssize_t bufsize = 0;
7884
    int chunk_size, final, converted, done;
7885
7886
    if (code_page < 0) {
7887
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7888
        return NULL;
7889
    }
7890
    if (size < 0) {
7891
        PyErr_BadInternalCall();
7892
        return NULL;
7893
    }
7894
7895
    if (consumed)
7896
        *consumed = 0;
7897
7898
    do
7899
    {
7900
#ifdef NEED_RETRY
7901
        if (size > DECODING_CHUNK_SIZE) {
7902
            chunk_size = DECODING_CHUNK_SIZE;
7903
            final = 0;
7904
            done = 0;
7905
        }
7906
        else
7907
#endif
7908
        {
7909
            chunk_size = (int)size;
7910
            final = (consumed == NULL);
7911
            done = 1;
7912
        }
7913
7914
        if (chunk_size == 0 && done) {
7915
            if (buf != NULL)
7916
                break;
7917
            _Py_RETURN_UNICODE_EMPTY();
7918
        }
7919
7920
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7921
                                            s, chunk_size);
7922
        if (converted == -2)
7923
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7924
                                                s, chunk_size,
7925
                                                errors, final);
7926
        assert(converted != 0 || done);
7927
7928
        if (converted < 0) {
7929
            PyMem_Free(buf);
7930
            return NULL;
7931
        }
7932
7933
        if (consumed)
7934
            *consumed += converted;
7935
7936
        s += converted;
7937
        size -= converted;
7938
    } while (!done);
7939
7940
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7941
    PyMem_Free(buf);
7942
    return v;
7943
}
7944
7945
PyObject *
7946
PyUnicode_DecodeCodePageStateful(int code_page,
7947
                                 const char *s,
7948
                                 Py_ssize_t size,
7949
                                 const char *errors,
7950
                                 Py_ssize_t *consumed)
7951
{
7952
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7953
}
7954
7955
PyObject *
7956
PyUnicode_DecodeMBCSStateful(const char *s,
7957
                             Py_ssize_t size,
7958
                             const char *errors,
7959
                             Py_ssize_t *consumed)
7960
{
7961
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7962
}
7963
7964
PyObject *
7965
PyUnicode_DecodeMBCS(const char *s,
7966
                     Py_ssize_t size,
7967
                     const char *errors)
7968
{
7969
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7970
}
7971
7972
static DWORD
7973
encode_code_page_flags(UINT code_page, const char *errors)
7974
{
7975
    if (code_page == CP_UTF8) {
7976
        return WC_ERR_INVALID_CHARS;
7977
    }
7978
    else if (code_page == CP_UTF7) {
7979
        /* CP_UTF7 only supports flags=0 */
7980
        return 0;
7981
    }
7982
    else {
7983
        if (errors != NULL && strcmp(errors, "replace") == 0)
7984
            return 0;
7985
        else
7986
            return WC_NO_BEST_FIT_CHARS;
7987
    }
7988
}
7989
7990
/*
7991
 * Encode a Unicode string to a Windows code page into a byte string in strict
7992
 * mode.
7993
 *
7994
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7995
 * an OSError and returns -1 on other error.
7996
 */
7997
static int
7998
encode_code_page_strict(UINT code_page, PyObject **outbytes,
7999
                        PyObject *unicode, Py_ssize_t offset, int len,
8000
                        const char* errors)
8001
{
8002
    BOOL usedDefaultChar = FALSE;
8003
    BOOL *pusedDefaultChar = &usedDefaultChar;
8004
    int outsize;
8005
    wchar_t *p;
8006
    Py_ssize_t size;
8007
    const DWORD flags = encode_code_page_flags(code_page, NULL);
8008
    char *out;
8009
    /* Create a substring so that we can get the UTF-16 representation
8010
       of just the slice under consideration. */
8011
    PyObject *substring;
8012
    int ret = -1;
8013
8014
    assert(len > 0);
8015
8016
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
8017
        pusedDefaultChar = &usedDefaultChar;
8018
    else
8019
        pusedDefaultChar = NULL;
8020
8021
    substring = PyUnicode_Substring(unicode, offset, offset+len);
8022
    if (substring == NULL)
8023
        return -1;
8024
    p = PyUnicode_AsWideCharString(substring, &size);
8025
    Py_CLEAR(substring);
8026
    if (p == NULL) {
8027
        return -1;
8028
    }
8029
    assert(size <= INT_MAX);
8030
8031
    /* First get the size of the result */
8032
    outsize = WideCharToMultiByte(code_page, flags,
8033
                                  p, (int)size,
8034
                                  NULL, 0,
8035
                                  NULL, pusedDefaultChar);
8036
    if (outsize <= 0)
8037
        goto error;
8038
    /* If we used a default char, then we failed! */
8039
    if (pusedDefaultChar && *pusedDefaultChar) {
8040
        ret = -2;
8041
        goto done;
8042
    }
8043
8044
    if (*outbytes == NULL) {
8045
        /* Create string object */
8046
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
8047
        if (*outbytes == NULL) {
8048
            goto done;
8049
        }
8050
        out = PyBytes_AS_STRING(*outbytes);
8051
    }
8052
    else {
8053
        /* Extend string object */
8054
        const Py_ssize_t n = PyBytes_Size(*outbytes);
8055
        if (outsize > PY_SSIZE_T_MAX - n) {
8056
            PyErr_NoMemory();
8057
            goto done;
8058
        }
8059
        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
8060
            goto done;
8061
        }
8062
        out = PyBytes_AS_STRING(*outbytes) + n;
8063
    }
8064
8065
    /* Do the conversion */
8066
    outsize = WideCharToMultiByte(code_page, flags,
8067
                                  p, (int)size,
8068
                                  out, outsize,
8069
                                  NULL, pusedDefaultChar);
8070
    if (outsize <= 0)
8071
        goto error;
8072
    if (pusedDefaultChar && *pusedDefaultChar) {
8073
        ret = -2;
8074
        goto done;
8075
    }
8076
    ret = 0;
8077
8078
done:
8079
    PyMem_Free(p);
8080
    return ret;
8081
8082
error:
8083
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
8084
        ret = -2;
8085
        goto done;
8086
    }
8087
    PyErr_SetFromWindowsErr(0);
8088
    goto done;
8089
}
8090
8091
/*
8092
 * Encode a Unicode string to a Windows code page into a byte string using an
8093
 * error handler.
8094
 *
8095
 * Returns consumed characters if succeed, or raise an OSError and returns
8096
 * -1 on other error.
8097
 */
8098
static int
8099
encode_code_page_errors(UINT code_page, PyObject **outbytes,
8100
                        PyObject *unicode, Py_ssize_t unicode_offset,
8101
                        Py_ssize_t insize, const char* errors)
8102
{
8103
    const DWORD flags = encode_code_page_flags(code_page, errors);
8104
    Py_ssize_t pos = unicode_offset;
8105
    Py_ssize_t endin = unicode_offset + insize;
8106
    /* Ideally, we should get reason from FormatMessage. This is the Windows
8107
       2000 English version of the message. */
8108
    const char *reason = "invalid character";
8109
    /* 4=maximum length of a UTF-8 sequence */
8110
    char buffer[4];
8111
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
8112
    Py_ssize_t outsize;
8113
    char *out;
8114
    PyObject *errorHandler = NULL;
8115
    PyObject *exc = NULL;
8116
    PyObject *encoding_obj = NULL;
8117
    const char *encoding;
8118
    Py_ssize_t newpos, newoutsize;
8119
    PyObject *rep;
8120
    int ret = -1;
8121
8122
    assert(insize > 0);
8123
8124
    encoding = code_page_name(code_page, &encoding_obj);
8125
    if (encoding == NULL)
8126
        return -1;
8127
8128
    if (errors == NULL || strcmp(errors, "strict") == 0) {
8129
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
8130
           then we raise a UnicodeEncodeError. */
8131
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
8132
        if (exc != NULL) {
8133
            PyCodec_StrictErrors(exc);
8134
            Py_DECREF(exc);
8135
        }
8136
        Py_XDECREF(encoding_obj);
8137
        return -1;
8138
    }
8139
8140
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
8141
        pusedDefaultChar = &usedDefaultChar;
8142
    else
8143
        pusedDefaultChar = NULL;
8144
8145
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
8146
        PyErr_NoMemory();
8147
        goto error;
8148
    }
8149
    outsize = insize * Py_ARRAY_LENGTH(buffer);
8150
8151
    if (*outbytes == NULL) {
8152
        /* Create string object */
8153
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
8154
        if (*outbytes == NULL)
8155
            goto error;
8156
        out = PyBytes_AS_STRING(*outbytes);
8157
    }
8158
    else {
8159
        /* Extend string object */
8160
        Py_ssize_t n = PyBytes_Size(*outbytes);
8161
        if (n > PY_SSIZE_T_MAX - outsize) {
8162
            PyErr_NoMemory();
8163
            goto error;
8164
        }
8165
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
8166
            goto error;
8167
        out = PyBytes_AS_STRING(*outbytes) + n;
8168
    }
8169
8170
    /* Encode the string character per character */
8171
    while (pos < endin)
8172
    {
8173
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8174
        wchar_t chars[2];
8175
        int charsize;
8176
        if (ch < 0x10000) {
8177
            chars[0] = (wchar_t)ch;
8178
            charsize = 1;
8179
        }
8180
        else {
8181
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8182
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8183
            charsize = 2;
8184
        }
8185
8186
        outsize = WideCharToMultiByte(code_page, flags,
8187
                                      chars, charsize,
8188
                                      buffer, Py_ARRAY_LENGTH(buffer),
8189
                                      NULL, pusedDefaultChar);
8190
        if (outsize > 0) {
8191
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8192
            {
8193
                pos++;
8194
                memcpy(out, buffer, outsize);
8195
                out += outsize;
8196
                continue;
8197
            }
8198
        }
8199
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8200
            PyErr_SetFromWindowsErr(0);
8201
            goto error;
8202
        }
8203
8204
        rep = unicode_encode_call_errorhandler(
8205
                  errors, &errorHandler, encoding, reason,
8206
                  unicode, &exc,
8207
                  pos, pos + 1, &newpos);
8208
        if (rep == NULL)
8209
            goto error;
8210
8211
        Py_ssize_t morebytes = pos - newpos;
8212
        if (PyBytes_Check(rep)) {
8213
            outsize = PyBytes_GET_SIZE(rep);
8214
            morebytes += outsize;
8215
            if (morebytes > 0) {
8216
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8217
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
8218
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8219
                    Py_DECREF(rep);
8220
                    goto error;
8221
                }
8222
                out = PyBytes_AS_STRING(*outbytes) + offset;
8223
            }
8224
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8225
            out += outsize;
8226
        }
8227
        else {
8228
            Py_ssize_t i;
8229
            int kind;
8230
            const void *data;
8231
8232
            outsize = PyUnicode_GET_LENGTH(rep);
8233
            morebytes += outsize;
8234
            if (morebytes > 0) {
8235
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8236
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
8237
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8238
                    Py_DECREF(rep);
8239
                    goto error;
8240
                }
8241
                out = PyBytes_AS_STRING(*outbytes) + offset;
8242
            }
8243
            kind = PyUnicode_KIND(rep);
8244
            data = PyUnicode_DATA(rep);
8245
            for (i=0; i < outsize; i++) {
8246
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8247
                if (ch > 127) {
8248
                    raise_encode_exception(&exc,
8249
                        encoding, unicode,
8250
                        pos, pos + 1,
8251
                        "unable to encode error handler result to ASCII");
8252
                    Py_DECREF(rep);
8253
                    goto error;
8254
                }
8255
                *out = (unsigned char)ch;
8256
                out++;
8257
            }
8258
        }
8259
        pos = newpos;
8260
        Py_DECREF(rep);
8261
    }
8262
    /* write a NUL byte */
8263
    *out = 0;
8264
    outsize = out - PyBytes_AS_STRING(*outbytes);
8265
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8266
    if (_PyBytes_Resize(outbytes, outsize) < 0)
8267
        goto error;
8268
    ret = 0;
8269
8270
error:
8271
    Py_XDECREF(encoding_obj);
8272
    Py_XDECREF(errorHandler);
8273
    Py_XDECREF(exc);
8274
    return ret;
8275
}
8276
8277
/* Encode the str object `unicode` to the Windows code page `code_page`.

   The string is processed in chunks (of at most DECODING_CHUNK_SIZE
   characters when NEED_RETRY is defined).  Each chunk is first encoded in
   strict mode; if that reports an encode error (-2) the chunk is encoded
   again through the error handler named by `errors`.

   Returns a new bytes object, or NULL with an exception set (bad argument
   for a non-str, ValueError for a negative code page, or whatever the
   chunk encoders raised). */
static PyObject *
encode_code_page(int code_page,
                 PyObject *unicode,
                 const char *errors)
{
    PyObject *result = NULL;
    Py_ssize_t remaining;
    Py_ssize_t start = 0;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    remaining = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (remaining == 0) {
        return PyBytes_FromStringAndSize(NULL, 0);
    }

    for (;;) {
        int chunk_len, done;

#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            done = 0;
        }
        else
#endif
        {
            chunk_len = (int)remaining;
            done = 1;
        }

        int status = encode_code_page_strict(code_page, &result,
                                             unicode, start, chunk_len,
                                             errors);
        if (status == -2) {
            status = encode_code_page_errors(code_page, &result,
                                             unicode, start,
                                             chunk_len, errors);
        }
        if (status < 0) {
            Py_XDECREF(result);
            return NULL;
        }

        start += chunk_len;
        remaining -= chunk_len;

        if (done) {
            break;
        }
    }

    return result;
}
8335
8336
/* Public wrapper: forwards all arguments unchanged to encode_code_page()
   and returns its result. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyObject *encoded;

    encoded = encode_code_page(code_page, unicode, errors);
    return encoded;
}
8343
8344
/* Encode `unicode` with the CP_ACP code page and a NULL error-handler name
   by calling PyUnicode_EncodeCodePage(). */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *errors = NULL;

    return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
}
8349
8350
#undef NEED_RETRY
8351
8352
#endif /* MS_WINDOWS */
8353
8354
/* --- Character Mapping Codec -------------------------------------------- */
8355
8356
static int
8357
charmap_decode_string(const char *s,
8358
                      Py_ssize_t size,
8359
                      PyObject *mapping,
8360
                      const char *errors,
8361
                      _PyUnicodeWriter *writer)
8362
17.4k
{
8363
17.4k
    const char *starts = s;
8364
17.4k
    const char *e;
8365
17.4k
    Py_ssize_t startinpos, endinpos;
8366
17.4k
    PyObject *errorHandler = NULL, *exc = NULL;
8367
17.4k
    Py_ssize_t maplen;
8368
17.4k
    int mapkind;
8369
17.4k
    const void *mapdata;
8370
17.4k
    Py_UCS4 x;
8371
17.4k
    unsigned char ch;
8372
8373
17.4k
    maplen = PyUnicode_GET_LENGTH(mapping);
8374
17.4k
    mapdata = PyUnicode_DATA(mapping);
8375
17.4k
    mapkind = PyUnicode_KIND(mapping);
8376
8377
17.4k
    e = s + size;
8378
8379
17.4k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8380
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8381
         * is disabled in encoding aliases, latin1 is preferred because
8382
         * its implementation is faster. */
8383
122
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8384
122
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8385
122
        Py_UCS4 maxchar = writer->maxchar;
8386
8387
122
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8388
2.59k
        while (s < e) {
8389
2.47k
            ch = *s;
8390
2.47k
            x = mapdata_ucs1[ch];
8391
2.47k
            if (x > maxchar) {
8392
113
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8393
0
                    goto onError;
8394
113
                maxchar = writer->maxchar;
8395
113
                outdata = (Py_UCS1 *)writer->data;
8396
113
            }
8397
2.47k
            outdata[writer->pos] = x;
8398
2.47k
            writer->pos++;
8399
2.47k
            ++s;
8400
2.47k
        }
8401
122
        return 0;
8402
122
    }
8403
8404
72.2k
    while (s < e) {
8405
62.6k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8406
62.6k
            int outkind = writer->kind;
8407
62.6k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8408
62.6k
            if (outkind == PyUnicode_1BYTE_KIND) {
8409
33.3k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8410
33.3k
                Py_UCS4 maxchar = writer->maxchar;
8411
114k
                while (s < e) {
8412
113k
                    ch = *s;
8413
113k
                    x = mapdata_ucs2[ch];
8414
113k
                    if (x > maxchar)
8415
31.8k
                        goto Error;
8416
81.4k
                    outdata[writer->pos] = x;
8417
81.4k
                    writer->pos++;
8418
81.4k
                    ++s;
8419
81.4k
                }
8420
1.56k
                break;
8421
33.3k
            }
8422
29.2k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8423
29.2k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8424
652k
                while (s < e) {
8425
645k
                    ch = *s;
8426
645k
                    x = mapdata_ucs2[ch];
8427
645k
                    if (x == 0xFFFE)
8428
23.0k
                        goto Error;
8429
622k
                    outdata[writer->pos] = x;
8430
622k
                    writer->pos++;
8431
622k
                    ++s;
8432
622k
                }
8433
6.15k
                break;
8434
29.2k
            }
8435
62.6k
        }
8436
0
        ch = *s;
8437
8438
0
        if (ch < maplen)
8439
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8440
0
        else
8441
0
            x = 0xfffe; /* invalid value */
8442
54.9k
Error:
8443
54.9k
        if (x == 0xfffe)
8444
37.2k
        {
8445
            /* undefined mapping */
8446
37.2k
            startinpos = s-starts;
8447
37.2k
            endinpos = startinpos+1;
8448
37.2k
            if (unicode_decode_call_errorhandler_writer(
8449
37.2k
                    errors, &errorHandler,
8450
37.2k
                    "charmap", "character maps to <undefined>",
8451
37.2k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8452
37.2k
                    writer)) {
8453
16
                goto onError;
8454
16
            }
8455
37.2k
            continue;
8456
37.2k
        }
8457
8458
17.6k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8459
0
            goto onError;
8460
17.6k
        ++s;
8461
17.6k
    }
8462
17.3k
    Py_XDECREF(errorHandler);
8463
17.3k
    Py_XDECREF(exc);
8464
17.3k
    return 0;
8465
8466
16
onError:
8467
16
    Py_XDECREF(errorHandler);
8468
16
    Py_XDECREF(exc);
8469
16
    return -1;
8470
17.3k
}
8471
8472
static int
8473
charmap_decode_mapping(const char *s,
8474
                       Py_ssize_t size,
8475
                       PyObject *mapping,
8476
                       const char *errors,
8477
                       _PyUnicodeWriter *writer)
8478
0
{
8479
0
    const char *starts = s;
8480
0
    const char *e;
8481
0
    Py_ssize_t startinpos, endinpos;
8482
0
    PyObject *errorHandler = NULL, *exc = NULL;
8483
0
    unsigned char ch;
8484
0
    PyObject *key, *item = NULL;
8485
8486
0
    e = s + size;
8487
8488
0
    while (s < e) {
8489
0
        ch = *s;
8490
8491
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8492
0
        key = PyLong_FromLong((long)ch);
8493
0
        if (key == NULL)
8494
0
            goto onError;
8495
8496
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8497
0
        Py_DECREF(key);
8498
0
        if (rc == 0) {
8499
            /* No mapping found means: mapping is undefined. */
8500
0
            goto Undefined;
8501
0
        }
8502
0
        if (item == NULL) {
8503
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8504
                /* No mapping found means: mapping is undefined. */
8505
0
                PyErr_Clear();
8506
0
                goto Undefined;
8507
0
            } else
8508
0
                goto onError;
8509
0
        }
8510
8511
        /* Apply mapping */
8512
0
        if (item == Py_None)
8513
0
            goto Undefined;
8514
0
        if (PyLong_Check(item)) {
8515
0
            long value = PyLong_AsLong(item);
8516
0
            if (value == 0xFFFE)
8517
0
                goto Undefined;
8518
0
            if (value < 0 || value > MAX_UNICODE) {
8519
0
                PyErr_Format(PyExc_TypeError,
8520
0
                             "character mapping must be in range(0x%x)",
8521
0
                             (unsigned long)MAX_UNICODE + 1);
8522
0
                goto onError;
8523
0
            }
8524
8525
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8526
0
                goto onError;
8527
0
        }
8528
0
        else if (PyUnicode_Check(item)) {
8529
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8530
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8531
0
                if (value == 0xFFFE)
8532
0
                    goto Undefined;
8533
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8534
0
                    goto onError;
8535
0
            }
8536
0
            else {
8537
0
                writer->overallocate = 1;
8538
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8539
0
                    goto onError;
8540
0
            }
8541
0
        }
8542
0
        else {
8543
            /* wrong return value */
8544
0
            PyErr_SetString(PyExc_TypeError,
8545
0
                            "character mapping must return integer, None or str");
8546
0
            goto onError;
8547
0
        }
8548
0
        Py_CLEAR(item);
8549
0
        ++s;
8550
0
        continue;
8551
8552
0
Undefined:
8553
        /* undefined mapping */
8554
0
        Py_CLEAR(item);
8555
0
        startinpos = s-starts;
8556
0
        endinpos = startinpos+1;
8557
0
        if (unicode_decode_call_errorhandler_writer(
8558
0
                errors, &errorHandler,
8559
0
                "charmap", "character maps to <undefined>",
8560
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8561
0
                writer)) {
8562
0
            goto onError;
8563
0
        }
8564
0
    }
8565
0
    Py_XDECREF(errorHandler);
8566
0
    Py_XDECREF(exc);
8567
0
    return 0;
8568
8569
0
onError:
8570
0
    Py_XDECREF(item);
8571
0
    Py_XDECREF(errorHandler);
8572
0
    Py_XDECREF(exc);
8573
0
    return -1;
8574
0
}
8575
8576
PyObject *
8577
PyUnicode_DecodeCharmap(const char *s,
8578
                        Py_ssize_t size,
8579
                        PyObject *mapping,
8580
                        const char *errors)
8581
17.4k
{
8582
17.4k
    _PyUnicodeWriter writer;
8583
8584
    /* Default to Latin-1 */
8585
17.4k
    if (mapping == NULL)
8586
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8587
8588
17.4k
    if (size == 0)
8589
0
        _Py_RETURN_UNICODE_EMPTY();
8590
17.4k
    _PyUnicodeWriter_Init(&writer);
8591
17.4k
    writer.min_length = size;
8592
17.4k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8593
0
        goto onError;
8594
8595
17.4k
    if (PyUnicode_CheckExact(mapping)) {
8596
17.4k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8597
16
            goto onError;
8598
17.4k
    }
8599
0
    else {
8600
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8601
0
            goto onError;
8602
0
    }
8603
17.4k
    return _PyUnicodeWriter_Finish(&writer);
8604
8605
16
  onError:
8606
16
    _PyUnicodeWriter_Dealloc(&writer);
8607
16
    return NULL;
8608
17.4k
}
8609
8610
/* Charmap encoding: the lookup table */
8611
8612
/*[clinic input]
8613
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8614
[clinic start generated code]*/
8615
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8616
8617
struct encoding_map {
8618
    PyObject_HEAD
8619
    unsigned char level1[32];
8620
    int count2, count3;
8621
    unsigned char level23[1];
8622
};
8623
8624
/*[clinic input]
8625
EncodingMap.size
8626
8627
Return the size (in bytes) of this object.
8628
[clinic start generated code]*/
8629
8630
static PyObject *
8631
EncodingMap_size_impl(struct encoding_map *self)
8632
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8633
0
{
8634
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8635
0
                           128*self->count3);
8636
0
}
8637
8638
static PyMethodDef encoding_map_methods[] = {
8639
    ENCODINGMAP_SIZE_METHODDEF
8640
    {NULL, NULL}
8641
};
8642
8643
static PyTypeObject EncodingMapType = {
8644
    PyVarObject_HEAD_INIT(NULL, 0)
8645
    .tp_name = "EncodingMap",
8646
    .tp_basicsize = sizeof(struct encoding_map),
8647
    /* methods */
8648
    .tp_flags = Py_TPFLAGS_DEFAULT,
8649
    .tp_methods = encoding_map_methods,
8650
};
8651
8652
PyObject*
8653
PyUnicode_BuildEncodingMap(PyObject* string)
8654
114
{
8655
114
    PyObject *result;
8656
114
    struct encoding_map *mresult;
8657
114
    int i;
8658
114
    int need_dict = 0;
8659
114
    unsigned char level1[32];
8660
114
    unsigned char level2[512];
8661
114
    unsigned char *mlevel1, *mlevel2, *mlevel3;
8662
114
    int count2 = 0, count3 = 0;
8663
114
    int kind;
8664
114
    const void *data;
8665
114
    int length;
8666
114
    Py_UCS4 ch;
8667
8668
114
    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8669
0
        PyErr_BadArgument();
8670
0
        return NULL;
8671
0
    }
8672
114
    kind = PyUnicode_KIND(string);
8673
114
    data = PyUnicode_DATA(string);
8674
114
    length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
8675
114
    memset(level1, 0xFF, sizeof level1);
8676
114
    memset(level2, 0xFF, sizeof level2);
8677
8678
    /* If there isn't a one-to-one mapping of NULL to \0,
8679
       or if there are non-BMP characters, we need to use
8680
       a mapping dictionary. */
8681
114
    if (PyUnicode_READ(kind, data, 0) != 0)
8682
0
        need_dict = 1;
8683
29.1k
    for (i = 1; i < length; i++) {
8684
29.0k
        int l1, l2;
8685
29.0k
        ch = PyUnicode_READ(kind, data, i);
8686
29.0k
        if (ch == 0 || ch > 0xFFFF) {
8687
0
            need_dict = 1;
8688
0
            break;
8689
0
        }
8690
29.0k
        if (ch == 0xFFFE)
8691
            /* unmapped character */
8692
739
            continue;
8693
28.3k
        l1 = ch >> 11;
8694
28.3k
        l2 = ch >> 7;
8695
28.3k
        if (level1[l1] == 0xFF)
8696
207
            level1[l1] = count2++;
8697
28.3k
        if (level2[l2] == 0xFF)
8698
623
            level2[l2] = count3++;
8699
28.3k
    }
8700
8701
114
    if (count2 >= 0xFF || count3 >= 0xFF)
8702
0
        need_dict = 1;
8703
8704
114
    if (need_dict) {
8705
0
        PyObject *result = PyDict_New();
8706
0
        if (!result)
8707
0
            return NULL;
8708
0
        for (i = 0; i < length; i++) {
8709
0
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
8710
0
            PyObject *key = PyLong_FromLong(c);
8711
0
            if (key == NULL) {
8712
0
                Py_DECREF(result);
8713
0
                return NULL;
8714
0
            }
8715
0
            PyObject *value = PyLong_FromLong(i);
8716
0
            if (value == NULL) {
8717
0
                Py_DECREF(key);
8718
0
                Py_DECREF(result);
8719
0
                return NULL;
8720
0
            }
8721
0
            int rc = PyDict_SetItem(result, key, value);
8722
0
            Py_DECREF(key);
8723
0
            Py_DECREF(value);
8724
0
            if (rc < 0) {
8725
0
                Py_DECREF(result);
8726
0
                return NULL;
8727
0
            }
8728
0
        }
8729
0
        return result;
8730
0
    }
8731
8732
    /* Create a three-level trie */
8733
114
    result = PyObject_Malloc(sizeof(struct encoding_map) +
8734
114
                             16*count2 + 128*count3 - 1);
8735
114
    if (!result) {
8736
0
        return PyErr_NoMemory();
8737
0
    }
8738
8739
114
    _PyObject_Init(result, &EncodingMapType);
8740
114
    mresult = (struct encoding_map*)result;
8741
114
    mresult->count2 = count2;
8742
114
    mresult->count3 = count3;
8743
114
    mlevel1 = mresult->level1;
8744
114
    mlevel2 = mresult->level23;
8745
114
    mlevel3 = mresult->level23 + 16*count2;
8746
114
    memcpy(mlevel1, level1, 32);
8747
114
    memset(mlevel2, 0xFF, 16*count2);
8748
114
    memset(mlevel3, 0, 128*count3);
8749
114
    count3 = 0;
8750
29.1k
    for (i = 1; i < length; i++) {
8751
29.0k
        int o1, o2, o3, i2, i3;
8752
29.0k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8753
29.0k
        if (ch == 0xFFFE)
8754
            /* unmapped character */
8755
739
            continue;
8756
28.3k
        o1 = ch>>11;
8757
28.3k
        o2 = (ch>>7) & 0xF;
8758
28.3k
        i2 = 16*mlevel1[o1] + o2;
8759
28.3k
        if (mlevel2[i2] == 0xFF)
8760
623
            mlevel2[i2] = count3++;
8761
28.3k
        o3 = ch & 0x7F;
8762
28.3k
        i3 = 128*mlevel2[i2] + o3;
8763
28.3k
        mlevel3[i3] = i;
8764
28.3k
    }
8765
114
    return result;
8766
114
}
8767
8768
static int
8769
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8770
0
{
8771
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8772
0
    int l1 = c>>11;
8773
0
    int l2 = (c>>7) & 0xF;
8774
0
    int l3 = c & 0x7F;
8775
0
    int i;
8776
8777
0
    if (c > 0xFFFF)
8778
0
        return -1;
8779
0
    if (c == 0)
8780
0
        return 0;
8781
    /* level 1*/
8782
0
    i = map->level1[l1];
8783
0
    if (i == 0xFF) {
8784
0
        return -1;
8785
0
    }
8786
    /* level 2*/
8787
0
    i = map->level23[16*i+l2];
8788
0
    if (i == 0xFF) {
8789
0
        return -1;
8790
0
    }
8791
    /* level 3 */
8792
0
    i = map->level23[16*map->count2 + 128*i + l3];
8793
0
    if (i == 0) {
8794
0
        return -1;
8795
0
    }
8796
0
    return i;
8797
0
}
8798
8799
/* Lookup the character in the mapping.
8800
   On success, return PyLong, PyBytes or None (if the character can't be found).
8801
   If the result is PyLong, put its value in replace.
8802
   On error, return NULL.
8803
   */
8804
static PyObject *
8805
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8806
0
{
8807
0
    PyObject *w = PyLong_FromLong((long)c);
8808
0
    PyObject *x;
8809
8810
0
    if (w == NULL)
8811
0
        return NULL;
8812
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8813
0
    Py_DECREF(w);
8814
0
    if (rc == 0) {
8815
        /* No mapping found means: mapping is undefined. */
8816
0
        Py_RETURN_NONE;
8817
0
    }
8818
0
    if (x == NULL) {
8819
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8820
            /* No mapping found means: mapping is undefined. */
8821
0
            PyErr_Clear();
8822
0
            Py_RETURN_NONE;
8823
0
        } else
8824
0
            return NULL;
8825
0
    }
8826
0
    else if (x == Py_None)
8827
0
        return x;
8828
0
    else if (PyLong_Check(x)) {
8829
0
        long value = PyLong_AsLong(x);
8830
0
        if (value < 0 || value > 255) {
8831
0
            PyErr_SetString(PyExc_TypeError,
8832
0
                            "character mapping must be in range(256)");
8833
0
            Py_DECREF(x);
8834
0
            return NULL;
8835
0
        }
8836
0
        *replace = (unsigned char)value;
8837
0
        return x;
8838
0
    }
8839
0
    else if (PyBytes_Check(x))
8840
0
        return x;
8841
0
    else {
8842
        /* wrong return value */
8843
0
        PyErr_Format(PyExc_TypeError,
8844
0
                     "character mapping must return integer, bytes or None, not %.400s",
8845
0
                     Py_TYPE(x)->tp_name);
8846
0
        Py_DECREF(x);
8847
0
        return NULL;
8848
0
    }
8849
0
}
8850
8851
static int
8852
charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8853
0
{
8854
0
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8855
    /* exponentially overallocate to minimize reallocations */
8856
0
    if (requiredsize < 2*outsize)
8857
0
        requiredsize = 2*outsize;
8858
0
    if (_PyBytes_Resize(outobj, requiredsize))
8859
0
        return -1;
8860
0
    return 0;
8861
0
}
8862
8863
/* Outcome of trying to encode a single character. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
8866
/* lookup the character, put the result in the output string and adjust
8867
   various state variables. Resize the output bytes object if not enough
8868
   space is available. Return a new reference to the object that
8869
   was put in the output buffer, or Py_None, if the mapping was undefined
8870
   (in which case no character was written) or NULL, if a
8871
   reallocation error occurred. The caller must decref the result */
8872
static charmapencode_result
8873
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8874
                     PyObject **outobj, Py_ssize_t *outpos)
8875
0
{
8876
0
    PyObject *rep;
8877
0
    unsigned char replace;
8878
0
    char *outstart;
8879
0
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8880
8881
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8882
0
        int res = encoding_map_lookup(c, mapping);
8883
0
        Py_ssize_t requiredsize = *outpos+1;
8884
0
        if (res == -1)
8885
0
            return enc_FAILED;
8886
0
        if (outsize<requiredsize)
8887
0
            if (charmapencode_resize(outobj, outpos, requiredsize))
8888
0
                return enc_EXCEPTION;
8889
0
        outstart = PyBytes_AS_STRING(*outobj);
8890
0
        outstart[(*outpos)++] = (char)res;
8891
0
        return enc_SUCCESS;
8892
0
    }
8893
8894
0
    rep = charmapencode_lookup(c, mapping, &replace);
8895
0
    if (rep==NULL)
8896
0
        return enc_EXCEPTION;
8897
0
    else if (rep==Py_None) {
8898
0
        Py_DECREF(rep);
8899
0
        return enc_FAILED;
8900
0
    } else {
8901
0
        if (PyLong_Check(rep)) {
8902
0
            Py_ssize_t requiredsize = *outpos+1;
8903
0
            if (outsize<requiredsize)
8904
0
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8905
0
                    Py_DECREF(rep);
8906
0
                    return enc_EXCEPTION;
8907
0
                }
8908
0
            outstart = PyBytes_AS_STRING(*outobj);
8909
0
            outstart[(*outpos)++] = (char)replace;
8910
0
        }
8911
0
        else {
8912
0
            const char *repchars = PyBytes_AS_STRING(rep);
8913
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8914
0
            Py_ssize_t requiredsize = *outpos+repsize;
8915
0
            if (outsize<requiredsize)
8916
0
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8917
0
                    Py_DECREF(rep);
8918
0
                    return enc_EXCEPTION;
8919
0
                }
8920
0
            outstart = PyBytes_AS_STRING(*outobj);
8921
0
            memcpy(outstart + *outpos, repchars, repsize);
8922
0
            *outpos += repsize;
8923
0
        }
8924
0
    }
8925
0
    Py_DECREF(rep);
8926
0
    return enc_SUCCESS;
8927
0
}
8928
8929
/* handle an error in PyUnicode_EncodeCharmap
8930
   Return 0 on success, -1 on error */
8931
static int
8932
charmap_encoding_error(
8933
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8934
    PyObject **exceptionObject,
8935
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8936
    PyObject **res, Py_ssize_t *respos)
8937
0
{
8938
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8939
0
    Py_ssize_t size, repsize;
8940
0
    Py_ssize_t newpos;
8941
0
    int kind;
8942
0
    const void *data;
8943
0
    Py_ssize_t index;
8944
    /* startpos for collecting unencodable chars */
8945
0
    Py_ssize_t collstartpos = *inpos;
8946
0
    Py_ssize_t collendpos = *inpos+1;
8947
0
    Py_ssize_t collpos;
8948
0
    const char *encoding = "charmap";
8949
0
    const char *reason = "character maps to <undefined>";
8950
0
    charmapencode_result x;
8951
0
    Py_UCS4 ch;
8952
0
    int val;
8953
8954
0
    size = PyUnicode_GET_LENGTH(unicode);
8955
    /* find all unencodable characters */
8956
0
    while (collendpos < size) {
8957
0
        PyObject *rep;
8958
0
        unsigned char replace;
8959
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8960
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8961
0
            val = encoding_map_lookup(ch, mapping);
8962
0
            if (val != -1)
8963
0
                break;
8964
0
            ++collendpos;
8965
0
            continue;
8966
0
        }
8967
8968
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8969
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8970
0
        if (rep==NULL)
8971
0
            return -1;
8972
0
        else if (rep!=Py_None) {
8973
0
            Py_DECREF(rep);
8974
0
            break;
8975
0
        }
8976
0
        Py_DECREF(rep);
8977
0
        ++collendpos;
8978
0
    }
8979
    /* cache callback name lookup
8980
     * (if not done yet, i.e. it's the first error) */
8981
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8982
0
        *error_handler = _Py_GetErrorHandler(errors);
8983
8984
0
    switch (*error_handler) {
8985
0
    case _Py_ERROR_STRICT:
8986
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8987
0
        return -1;
8988
8989
0
    case _Py_ERROR_REPLACE:
8990
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8991
0
            x = charmapencode_output('?', mapping, res, respos);
8992
0
            if (x==enc_EXCEPTION) {
8993
0
                return -1;
8994
0
            }
8995
0
            else if (x==enc_FAILED) {
8996
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8997
0
                return -1;
8998
0
            }
8999
0
        }
9000
0
        _Py_FALLTHROUGH;
9001
0
    case _Py_ERROR_IGNORE:
9002
0
        *inpos = collendpos;
9003
0
        break;
9004
9005
0
    case _Py_ERROR_XMLCHARREFREPLACE:
9006
        /* generate replacement (temporarily (mis)uses p) */
9007
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
9008
0
            char buffer[2+29+1+1];
9009
0
            char *cp;
9010
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
9011
0
            for (cp = buffer; *cp; ++cp) {
9012
0
                x = charmapencode_output(*cp, mapping, res, respos);
9013
0
                if (x==enc_EXCEPTION)
9014
0
                    return -1;
9015
0
                else if (x==enc_FAILED) {
9016
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
9017
0
                    return -1;
9018
0
                }
9019
0
            }
9020
0
        }
9021
0
        *inpos = collendpos;
9022
0
        break;
9023
9024
0
    default:
9025
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
9026
0
                                                      encoding, reason, unicode, exceptionObject,
9027
0
                                                      collstartpos, collendpos, &newpos);
9028
0
        if (repunicode == NULL)
9029
0
            return -1;
9030
0
        if (PyBytes_Check(repunicode)) {
9031
            /* Directly copy bytes result to output. */
9032
0
            Py_ssize_t outsize = PyBytes_Size(*res);
9033
0
            Py_ssize_t requiredsize;
9034
0
            repsize = PyBytes_Size(repunicode);
9035
0
            requiredsize = *respos + repsize;
9036
0
            if (requiredsize > outsize)
9037
                /* Make room for all additional bytes. */
9038
0
                if (charmapencode_resize(res, respos, requiredsize)) {
9039
0
                    Py_DECREF(repunicode);
9040
0
                    return -1;
9041
0
                }
9042
0
            memcpy(PyBytes_AsString(*res) + *respos,
9043
0
                   PyBytes_AsString(repunicode),  repsize);
9044
0
            *respos += repsize;
9045
0
            *inpos = newpos;
9046
0
            Py_DECREF(repunicode);
9047
0
            break;
9048
0
        }
9049
        /* generate replacement  */
9050
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
9051
0
        data = PyUnicode_DATA(repunicode);
9052
0
        kind = PyUnicode_KIND(repunicode);
9053
0
        for (index = 0; index < repsize; index++) {
9054
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
9055
0
            x = charmapencode_output(repch, mapping, res, respos);
9056
0
            if (x==enc_EXCEPTION) {
9057
0
                Py_DECREF(repunicode);
9058
0
                return -1;
9059
0
            }
9060
0
            else if (x==enc_FAILED) {
9061
0
                Py_DECREF(repunicode);
9062
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
9063
0
                return -1;
9064
0
            }
9065
0
        }
9066
0
        *inpos = newpos;
9067
0
        Py_DECREF(repunicode);
9068
0
    }
9069
0
    return 0;
9070
0
}
9071
9072
/* Encode `unicode` through `mapping` and return the resulting bytes
   object, or NULL on error.  A NULL mapping encodes via
   unicode_encode_ucs1() with a limit of 256.  Unencodable characters
   are delegated to charmap_encoding_error(). */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    PyObject *res = NULL;                   /* output object */
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    Py_ssize_t inpos = 0;                   /* current input position */
    Py_ssize_t respos = 0;                  /* current output position */

    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    const void *data = PyUnicode_DATA(unicode);
    int kind = PyUnicode_KIND(unicode);

    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Start with one output byte per input character; the helpers
       resize the buffer when a replacement needs more. */
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL) {
        goto onError;
    }
    if (size == 0) {
        return res;
    }

    while (inpos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);

        switch (charmapencode_output(ch, mapping, &res, &respos)) {
        case enc_EXCEPTION:
            goto onError;
        case enc_FAILED:
            /* unencodable character: the handler advances inpos itself */
            if (charmap_encoding_error(unicode, &inpos, mapping,
                                       &exc,
                                       &error_handler, &error_handler_obj, errors,
                                       &res, &respos)) {
                goto onError;
            }
            break;
        default:
            /* done with this character => adjust input position */
            ++inpos;
            break;
        }
    }

    /* Shrink the result if we allocated too much */
    if (respos < PyBytes_GET_SIZE(res) && _PyBytes_Resize(&res, respos) < 0) {
        goto onError;
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9140
9141
/* Encode `unicode` through `mapping`, passing NULL as the errors
   argument.  Both arguments are required: a non-str `unicode` or a NULL
   `mapping` is reported through PyErr_BadArgument(). */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
9151
9152
/* create or adjust a UnicodeTranslateError */
9153
static void
9154
make_translate_exception(PyObject **exceptionObject,
9155
                         PyObject *unicode,
9156
                         Py_ssize_t startpos, Py_ssize_t endpos,
9157
                         const char *reason)
9158
0
{
9159
0
    if (*exceptionObject == NULL) {
9160
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9161
0
            unicode, startpos, endpos, reason);
9162
0
    }
9163
0
    else {
9164
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9165
0
            goto onError;
9166
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9167
0
            goto onError;
9168
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9169
0
            goto onError;
9170
0
        return;
9171
0
      onError:
9172
0
        Py_CLEAR(*exceptionObject);
9173
0
    }
9174
0
}
9175
9176
/* error handling callback helper:
9177
   build arguments, call the callback and check the arguments,
9178
   put the result into newpos and return the replacement string, which
9179
   has to be freed by the caller */
9180
static PyObject *
9181
unicode_translate_call_errorhandler(const char *errors,
9182
                                    PyObject **errorHandler,
9183
                                    const char *reason,
9184
                                    PyObject *unicode, PyObject **exceptionObject,
9185
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9186
                                    Py_ssize_t *newpos)
9187
0
{
9188
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9189
9190
0
    Py_ssize_t i_newpos;
9191
0
    PyObject *restuple;
9192
0
    PyObject *resunicode;
9193
9194
0
    if (*errorHandler == NULL) {
9195
0
        *errorHandler = PyCodec_LookupError(errors);
9196
0
        if (*errorHandler == NULL)
9197
0
            return NULL;
9198
0
    }
9199
9200
0
    make_translate_exception(exceptionObject,
9201
0
                             unicode, startpos, endpos, reason);
9202
0
    if (*exceptionObject == NULL)
9203
0
        return NULL;
9204
9205
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9206
0
    if (restuple == NULL)
9207
0
        return NULL;
9208
0
    if (!PyTuple_Check(restuple)) {
9209
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9210
0
        Py_DECREF(restuple);
9211
0
        return NULL;
9212
0
    }
9213
0
    if (!PyArg_ParseTuple(restuple, argparse,
9214
0
                          &resunicode, &i_newpos)) {
9215
0
        Py_DECREF(restuple);
9216
0
        return NULL;
9217
0
    }
9218
0
    if (i_newpos<0)
9219
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9220
0
    else
9221
0
        *newpos = i_newpos;
9222
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9223
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9224
0
        Py_DECREF(restuple);
9225
0
        return NULL;
9226
0
    }
9227
0
    Py_INCREF(resunicode);
9228
0
    Py_DECREF(restuple);
9229
0
    return resunicode;
9230
0
}
9231
9232
/* Lookup the character ch in the mapping and put the result in result,
9233
   which must be decrefed by the caller.
9234
   The result can be PyLong, PyUnicode, None or NULL.
9235
   If the result is PyLong, put its value in replace.
9236
   Return 0 on success, -1 on error */
9237
static int
9238
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9239
338
{
9240
338
    PyObject *w = PyLong_FromLong((long)c);
9241
338
    PyObject *x;
9242
9243
338
    if (w == NULL)
9244
0
        return -1;
9245
338
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9246
338
    Py_DECREF(w);
9247
338
    if (rc == 0) {
9248
        /* No mapping found means: use 1:1 mapping. */
9249
158
        *result = NULL;
9250
158
        return 0;
9251
158
    }
9252
180
    if (x == NULL) {
9253
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9254
            /* No mapping found means: use 1:1 mapping. */
9255
0
            PyErr_Clear();
9256
0
            *result = NULL;
9257
0
            return 0;
9258
0
        } else
9259
0
            return -1;
9260
0
    }
9261
180
    else if (x == Py_None) {
9262
0
        *result = x;
9263
0
        return 0;
9264
0
    }
9265
180
    else if (PyLong_Check(x)) {
9266
0
        long value = PyLong_AsLong(x);
9267
0
        if (value < 0 || value > MAX_UNICODE) {
9268
0
            PyErr_Format(PyExc_ValueError,
9269
0
                         "character mapping must be in range(0x%x)",
9270
0
                         MAX_UNICODE+1);
9271
0
            Py_DECREF(x);
9272
0
            return -1;
9273
0
        }
9274
0
        *result = x;
9275
0
        *replace = (Py_UCS4)value;
9276
0
        return 0;
9277
0
    }
9278
180
    else if (PyUnicode_Check(x)) {
9279
180
        *result = x;
9280
180
        return 0;
9281
180
    }
9282
0
    else {
9283
        /* wrong return value */
9284
0
        PyErr_SetString(PyExc_TypeError,
9285
0
                        "character mapping must return integer, None or str");
9286
0
        Py_DECREF(x);
9287
0
        return -1;
9288
0
    }
9289
180
}
9290
9291
/* lookup the character, write the result into the writer.
9292
   Return 1 if the result was written into the writer, return 0 if the mapping
9293
   was undefined, raise an exception return -1 on error. */
9294
static int
9295
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9296
                        _PyUnicodeWriter *writer)
9297
212
{
9298
212
    PyObject *item;
9299
212
    Py_UCS4 replace;
9300
9301
212
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9302
0
        return -1;
9303
9304
212
    if (item == NULL) {
9305
        /* not found => default to 1:1 mapping */
9306
88
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9307
0
            return -1;
9308
0
        }
9309
88
        return 1;
9310
88
    }
9311
9312
124
    if (item == Py_None) {
9313
0
        Py_DECREF(item);
9314
0
        return 0;
9315
0
    }
9316
9317
124
    if (PyLong_Check(item)) {
9318
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9319
0
            Py_DECREF(item);
9320
0
            return -1;
9321
0
        }
9322
0
        Py_DECREF(item);
9323
0
        return 1;
9324
0
    }
9325
9326
124
    if (!PyUnicode_Check(item)) {
9327
0
        Py_DECREF(item);
9328
0
        return -1;
9329
0
    }
9330
9331
124
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9332
0
        Py_DECREF(item);
9333
0
        return -1;
9334
0
    }
9335
9336
124
    Py_DECREF(item);
9337
124
    return 1;
9338
124
}
9339
9340
static int
9341
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9342
                              Py_UCS1 *translate)
9343
126
{
9344
126
    PyObject *item = NULL;
9345
126
    Py_UCS4 replace;
9346
126
    int ret = 0;
9347
9348
126
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9349
0
        return -1;
9350
0
    }
9351
9352
126
    if (item == Py_None) {
9353
        /* deletion */
9354
0
        translate[ch] = 0xfe;
9355
0
    }
9356
126
    else if (item == NULL) {
9357
        /* not found => default to 1:1 mapping */
9358
70
        translate[ch] = ch;
9359
70
        return 1;
9360
70
    }
9361
56
    else if (PyLong_Check(item)) {
9362
0
        if (replace > 127) {
9363
            /* invalid character or character outside ASCII:
9364
               skip the fast translate */
9365
0
            goto exit;
9366
0
        }
9367
0
        translate[ch] = (Py_UCS1)replace;
9368
0
    }
9369
56
    else if (PyUnicode_Check(item)) {
9370
56
        if (PyUnicode_GET_LENGTH(item) != 1)
9371
56
            goto exit;
9372
9373
0
        replace = PyUnicode_READ_CHAR(item, 0);
9374
0
        if (replace > 127)
9375
0
            goto exit;
9376
0
        translate[ch] = (Py_UCS1)replace;
9377
0
    }
9378
0
    else {
9379
        /* not None, NULL, long or unicode */
9380
0
        goto exit;
9381
0
    }
9382
0
    ret = 1;
9383
9384
56
  exit:
9385
56
    Py_DECREF(item);
9386
56
    return ret;
9387
0
}
9388
9389
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9390
   was translated into writer, return 0 if the input string was partially
9391
   translated into writer, raise an exception and return -1 on error. */
9392
static int
9393
unicode_fast_translate(PyObject *input, PyObject *mapping,
9394
                       _PyUnicodeWriter *writer, int ignore,
9395
                       Py_ssize_t *input_pos)
9396
104
{
9397
104
    Py_UCS1 ascii_table[128], ch, ch2;
9398
104
    Py_ssize_t len;
9399
104
    const Py_UCS1 *in, *end;
9400
104
    Py_UCS1 *out;
9401
104
    int res = 0;
9402
9403
104
    len = PyUnicode_GET_LENGTH(input);
9404
9405
104
    memset(ascii_table, 0xff, 128);
9406
9407
104
    in = PyUnicode_1BYTE_DATA(input);
9408
104
    end = in + len;
9409
9410
104
    assert(PyUnicode_IS_ASCII(writer->buffer));
9411
104
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9412
104
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9413
9414
188
    for (; in < end; in++) {
9415
140
        ch = *in;
9416
140
        ch2 = ascii_table[ch];
9417
140
        if (ch2 == 0xff) {
9418
126
            int translate = unicode_fast_translate_lookup(mapping, ch,
9419
126
                                                          ascii_table);
9420
126
            if (translate < 0)
9421
0
                return -1;
9422
126
            if (translate == 0)
9423
56
                goto exit;
9424
70
            ch2 = ascii_table[ch];
9425
70
        }
9426
84
        if (ch2 == 0xfe) {
9427
0
            if (ignore)
9428
0
                continue;
9429
0
            goto exit;
9430
0
        }
9431
84
        assert(ch2 < 128);
9432
84
        *out = ch2;
9433
84
        out++;
9434
84
    }
9435
48
    res = 1;
9436
9437
104
exit:
9438
104
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9439
104
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9440
104
    return res;
9441
48
}
9442
9443
static PyObject *
9444
_PyUnicode_TranslateCharmap(PyObject *input,
9445
                            PyObject *mapping,
9446
                            const char *errors)
9447
104
{
9448
    /* input object */
9449
104
    const void *data;
9450
104
    Py_ssize_t size, i;
9451
104
    int kind;
9452
    /* output buffer */
9453
104
    _PyUnicodeWriter writer;
9454
    /* error handler */
9455
104
    const char *reason = "character maps to <undefined>";
9456
104
    PyObject *errorHandler = NULL;
9457
104
    PyObject *exc = NULL;
9458
104
    int ignore;
9459
104
    int res;
9460
9461
104
    if (mapping == NULL) {
9462
0
        PyErr_BadArgument();
9463
0
        return NULL;
9464
0
    }
9465
9466
104
    data = PyUnicode_DATA(input);
9467
104
    kind = PyUnicode_KIND(input);
9468
104
    size = PyUnicode_GET_LENGTH(input);
9469
9470
104
    if (size == 0)
9471
0
        return PyUnicode_FromObject(input);
9472
9473
    /* allocate enough for a simple 1:1 translation without
9474
       replacements, if we need more, we'll resize */
9475
104
    _PyUnicodeWriter_Init(&writer);
9476
104
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9477
0
        goto onError;
9478
9479
104
    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9480
9481
104
    if (PyUnicode_IS_ASCII(input)) {
9482
104
        res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9483
104
        if (res < 0) {
9484
0
            _PyUnicodeWriter_Dealloc(&writer);
9485
0
            return NULL;
9486
0
        }
9487
104
        if (res == 1)
9488
48
            return _PyUnicodeWriter_Finish(&writer);
9489
104
    }
9490
0
    else {
9491
0
        i = 0;
9492
0
    }
9493
9494
268
    while (i<size) {
9495
        /* try to encode it */
9496
212
        int translate;
9497
212
        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9498
212
        Py_ssize_t newpos;
9499
        /* startpos for collecting untranslatable chars */
9500
212
        Py_ssize_t collstart;
9501
212
        Py_ssize_t collend;
9502
212
        Py_UCS4 ch;
9503
9504
212
        ch = PyUnicode_READ(kind, data, i);
9505
212
        translate = charmaptranslate_output(ch, mapping, &writer);
9506
212
        if (translate < 0)
9507
0
            goto onError;
9508
9509
212
        if (translate != 0) {
9510
            /* it worked => adjust input pointer */
9511
212
            ++i;
9512
212
            continue;
9513
212
        }
9514
9515
        /* untranslatable character */
9516
0
        collstart = i;
9517
0
        collend = i+1;
9518
9519
        /* find all untranslatable characters */
9520
0
        while (collend < size) {
9521
0
            PyObject *x;
9522
0
            Py_UCS4 replace;
9523
0
            ch = PyUnicode_READ(kind, data, collend);
9524
0
            if (charmaptranslate_lookup(ch, mapping, &x, &replace))
9525
0
                goto onError;
9526
0
            Py_XDECREF(x);
9527
0
            if (x != Py_None)
9528
0
                break;
9529
0
            ++collend;
9530
0
        }
9531
9532
0
        if (ignore) {
9533
0
            i = collend;
9534
0
        }
9535
0
        else {
9536
0
            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9537
0
                                                             reason, input, &exc,
9538
0
                                                             collstart, collend, &newpos);
9539
0
            if (repunicode == NULL)
9540
0
                goto onError;
9541
0
            if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9542
0
                Py_DECREF(repunicode);
9543
0
                goto onError;
9544
0
            }
9545
0
            Py_DECREF(repunicode);
9546
0
            i = newpos;
9547
0
        }
9548
0
    }
9549
56
    Py_XDECREF(exc);
9550
56
    Py_XDECREF(errorHandler);
9551
56
    return _PyUnicodeWriter_Finish(&writer);
9552
9553
0
  onError:
9554
0
    _PyUnicodeWriter_Dealloc(&writer);
9555
0
    Py_XDECREF(exc);
9556
0
    Py_XDECREF(errorHandler);
9557
0
    return NULL;
9558
56
}
9559
9560
PyObject *
9561
PyUnicode_Translate(PyObject *str,
9562
                    PyObject *mapping,
9563
                    const char *errors)
9564
0
{
9565
0
    if (ensure_unicode(str) < 0)
9566
0
        return NULL;
9567
0
    return _PyUnicode_TranslateCharmap(str, mapping, errors);
9568
0
}
9569
9570
PyObject *
9571
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9572
4.09M
{
9573
4.09M
    if (!PyUnicode_Check(unicode)) {
9574
0
        PyErr_BadInternalCall();
9575
0
        return NULL;
9576
0
    }
9577
4.09M
    if (PyUnicode_IS_ASCII(unicode)) {
9578
        /* If the string is already ASCII, just return the same string */
9579
4.09M
        return Py_NewRef(unicode);
9580
4.09M
    }
9581
9582
2.20k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9583
2.20k
    PyObject *result = PyUnicode_New(len, 127);
9584
2.20k
    if (result == NULL) {
9585
0
        return NULL;
9586
0
    }
9587
9588
2.20k
    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9589
2.20k
    int kind = PyUnicode_KIND(unicode);
9590
2.20k
    const void *data = PyUnicode_DATA(unicode);
9591
2.20k
    Py_ssize_t i;
9592
31.2k
    for (i = 0; i < len; ++i) {
9593
29.2k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9594
29.2k
        if (ch < 127) {
9595
26.5k
            out[i] = ch;
9596
26.5k
        }
9597
2.66k
        else if (Py_UNICODE_ISSPACE(ch)) {
9598
1.08k
            out[i] = ' ';
9599
1.08k
        }
9600
1.58k
        else {
9601
1.58k
            int decimal = Py_UNICODE_TODECIMAL(ch);
9602
1.58k
            if (decimal < 0) {
9603
143
                out[i] = '?';
9604
143
                out[i+1] = '\0';
9605
143
                _PyUnicode_LENGTH(result) = i + 1;
9606
143
                break;
9607
143
            }
9608
1.43k
            out[i] = '0' + decimal;
9609
1.43k
        }
9610
29.2k
    }
9611
9612
2.20k
    assert(_PyUnicode_CheckConsistency(result, 1));
9613
2.20k
    return result;
9614
2.20k
}
9615
9616
/* --- Helpers ------------------------------------------------------------ */
9617
9618
/* helper macro to fixup start/end slice values */
9619
#define ADJUST_INDICES(start, end, len) \
9620
136M
    do {                                \
9621
136M
        if (end > len) {                \
9622
121M
            end = len;                  \
9623
121M
        }                               \
9624
136M
        else if (end < 0) {             \
9625
0
            end += len;                 \
9626
0
            if (end < 0) {              \
9627
0
                end = 0;                \
9628
0
            }                           \
9629
0
        }                               \
9630
136M
        if (start < 0) {                \
9631
0
            start += len;               \
9632
0
            if (start < 0) {            \
9633
0
                start = 0;              \
9634
0
            }                           \
9635
0
        }                               \
9636
136M
    } while (0)
9637
9638
static Py_ssize_t
9639
any_find_slice(PyObject* s1, PyObject* s2,
9640
               Py_ssize_t start,
9641
               Py_ssize_t end,
9642
               int direction)
9643
17.6M
{
9644
17.6M
    int kind1, kind2;
9645
17.6M
    const void *buf1, *buf2;
9646
17.6M
    Py_ssize_t len1, len2, result;
9647
9648
17.6M
    kind1 = PyUnicode_KIND(s1);
9649
17.6M
    kind2 = PyUnicode_KIND(s2);
9650
17.6M
    if (kind1 < kind2)
9651
0
        return -1;
9652
9653
17.6M
    len1 = PyUnicode_GET_LENGTH(s1);
9654
17.6M
    len2 = PyUnicode_GET_LENGTH(s2);
9655
17.6M
    ADJUST_INDICES(start, end, len1);
9656
17.6M
    if (end - start < len2)
9657
22.6k
        return -1;
9658
9659
17.6M
    buf1 = PyUnicode_DATA(s1);
9660
17.6M
    buf2 = PyUnicode_DATA(s2);
9661
17.6M
    if (len2 == 1) {
9662
17.5M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9663
17.5M
        result = findchar((const char *)buf1 + kind1*start,
9664
17.5M
                          kind1, end - start, ch, direction);
9665
17.5M
        if (result == -1)
9666
216k
            return -1;
9667
17.3M
        else
9668
17.3M
            return start + result;
9669
17.5M
    }
9670
9671
28.3k
    if (kind2 != kind1) {
9672
19.6k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9673
19.6k
        if (!buf2)
9674
0
            return -2;
9675
19.6k
    }
9676
9677
28.3k
    if (direction > 0) {
9678
28.3k
        switch (kind1) {
9679
8.72k
        case PyUnicode_1BYTE_KIND:
9680
8.72k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9681
5.52k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9682
3.20k
            else
9683
3.20k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9684
8.72k
            break;
9685
15.5k
        case PyUnicode_2BYTE_KIND:
9686
15.5k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9687
15.5k
            break;
9688
4.07k
        case PyUnicode_4BYTE_KIND:
9689
4.07k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9690
4.07k
            break;
9691
0
        default:
9692
0
            Py_UNREACHABLE();
9693
28.3k
        }
9694
28.3k
    }
9695
0
    else {
9696
0
        switch (kind1) {
9697
0
        case PyUnicode_1BYTE_KIND:
9698
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9699
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9700
0
            else
9701
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9702
0
            break;
9703
0
        case PyUnicode_2BYTE_KIND:
9704
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9705
0
            break;
9706
0
        case PyUnicode_4BYTE_KIND:
9707
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9708
0
            break;
9709
0
        default:
9710
0
            Py_UNREACHABLE();
9711
0
        }
9712
0
    }
9713
9714
28.3k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9715
28.3k
    if (kind2 != kind1)
9716
19.6k
        PyMem_Free((void *)buf2);
9717
9718
28.3k
    return result;
9719
28.3k
}
9720
9721
/* _PyUnicode_InsertThousandsGrouping() helper functions */
9722
#include "stringlib/localeutil.h"
9723
9724
/**
9725
 * InsertThousandsGrouping:
9726
 * @writer: Unicode writer.
9727
 * @n_buffer: Number of characters in @buffer.
9728
 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9729
 * @d_pos: Start of digits string.
9730
 * @n_digits: The number of digits in the string, in which we want
9731
 *            to put the grouping chars.
9732
 * @min_width: The minimum width of the digits in the output string.
9733
 *             Output will be zero-padded on the left to fill.
9734
 * @grouping: see definition in localeconv().
9735
 * @thousands_sep: see definition in localeconv().
9736
 *
9737
 * There are 2 modes: counting and filling. If @writer is NULL,
9738
 *  we are in counting mode, else filling mode.
9739
 * If counting, the required buffer size is returned.
9740
 * If filling, we know the buffer will be large enough, so we don't
9741
 *  need to pass in the buffer size.
9742
 * Inserts thousand grouping characters (as defined by grouping and
9743
 *  thousands_sep) into @writer.
9744
 *
9745
 * Return value: -1 on error, number of characters otherwise.
9746
 **/
9747
Py_ssize_t
9748
_PyUnicode_InsertThousandsGrouping(
9749
    _PyUnicodeWriter *writer,
9750
    Py_ssize_t n_buffer,
9751
    PyObject *digits,
9752
    Py_ssize_t d_pos,
9753
    Py_ssize_t n_digits,
9754
    Py_ssize_t min_width,
9755
    const char *grouping,
9756
    PyObject *thousands_sep,
9757
    Py_UCS4 *maxchar,
9758
    int forward)
9759
128
{
9760
128
    min_width = Py_MAX(0, min_width);
9761
128
    if (writer) {
9762
64
        assert(digits != NULL);
9763
64
        assert(maxchar == NULL);
9764
64
    }
9765
64
    else {
9766
64
        assert(digits == NULL);
9767
64
        assert(maxchar != NULL);
9768
64
    }
9769
128
    assert(0 <= d_pos);
9770
128
    assert(0 <= n_digits);
9771
128
    assert(grouping != NULL);
9772
9773
128
    Py_ssize_t count = 0;
9774
128
    Py_ssize_t n_zeros;
9775
128
    int loop_broken = 0;
9776
128
    int use_separator = 0; /* First time through, don't append the
9777
                              separator. They only go between
9778
                              groups. */
9779
128
    Py_ssize_t buffer_pos;
9780
128
    Py_ssize_t digits_pos;
9781
128
    Py_ssize_t len;
9782
128
    Py_ssize_t n_chars;
9783
128
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9784
                                        be looked at */
9785
    /* A generator that returns all of the grouping widths, until it
9786
       returns 0. */
9787
128
    GroupGenerator groupgen;
9788
128
    GroupGenerator_init(&groupgen, grouping);
9789
128
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9790
9791
    /* if digits are not grouped, thousands separator
9792
       should be an empty string */
9793
128
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9794
9795
128
    digits_pos = d_pos + (forward ? 0 : n_digits);
9796
128
    if (writer) {
9797
64
        buffer_pos = writer->pos + (forward ? 0 : n_buffer);
9798
64
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9799
64
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9800
64
    }
9801
64
    else {
9802
64
        buffer_pos = forward ? 0 : n_buffer;
9803
64
    }
9804
9805
128
    if (!writer) {
9806
64
        *maxchar = 127;
9807
64
    }
9808
9809
128
    while ((len = GroupGenerator_next(&groupgen)) > 0) {
9810
0
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9811
0
        n_zeros = Py_MAX(0, len - remaining);
9812
0
        n_chars = Py_MAX(0, Py_MIN(remaining, len));
9813
9814
        /* Use n_zero zero's and n_chars chars */
9815
9816
        /* Count only, don't do anything. */
9817
0
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9818
9819
        /* Copy into the writer. */
9820
0
        InsertThousandsGrouping_fill(writer, &buffer_pos,
9821
0
                                     digits, &digits_pos,
9822
0
                                     n_chars, n_zeros,
9823
0
                                     use_separator ? thousands_sep : NULL,
9824
0
                                     thousands_sep_len, maxchar, forward);
9825
9826
        /* Use a separator next time. */
9827
0
        use_separator = 1;
9828
9829
0
        remaining -= n_chars;
9830
0
        min_width -= len;
9831
9832
0
        if (remaining <= 0 && min_width <= 0) {
9833
0
            loop_broken = 1;
9834
0
            break;
9835
0
        }
9836
0
        min_width -= thousands_sep_len;
9837
0
    }
9838
128
    if (!loop_broken) {
9839
        /* We left the loop without using a break statement. */
9840
9841
128
        len = Py_MAX(Py_MAX(remaining, min_width), 1);
9842
128
        n_zeros = Py_MAX(0, len - remaining);
9843
128
        n_chars = Py_MAX(0, Py_MIN(remaining, len));
9844
9845
        /* Use n_zero zero's and n_chars chars */
9846
128
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9847
9848
        /* Copy into the writer. */
9849
128
        InsertThousandsGrouping_fill(writer, &buffer_pos,
9850
128
                                     digits, &digits_pos,
9851
128
                                     n_chars, n_zeros,
9852
128
                                     use_separator ? thousands_sep : NULL,
9853
128
                                     thousands_sep_len, maxchar, forward);
9854
128
    }
9855
128
    return count;
9856
128
}
9857
9858
Py_ssize_t
9859
PyUnicode_Count(PyObject *str,
9860
                PyObject *substr,
9861
                Py_ssize_t start,
9862
                Py_ssize_t end)
9863
0
{
9864
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9865
0
        return -1;
9866
9867
0
    return unicode_count_impl(str, substr, start, end);
9868
0
}
9869
9870
Py_ssize_t
9871
PyUnicode_Find(PyObject *str,
9872
               PyObject *substr,
9873
               Py_ssize_t start,
9874
               Py_ssize_t end,
9875
               int direction)
9876
0
{
9877
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9878
0
        return -2;
9879
9880
0
    return any_find_slice(str, substr, start, end, direction);
9881
0
}
9882
9883
Py_ssize_t
9884
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9885
                   Py_ssize_t start, Py_ssize_t end,
9886
                   int direction)
9887
520k
{
9888
520k
    int kind;
9889
520k
    Py_ssize_t len, result;
9890
520k
    len = PyUnicode_GET_LENGTH(str);
9891
520k
    ADJUST_INDICES(start, end, len);
9892
520k
    if (end - start < 1)
9893
0
        return -1;
9894
520k
    kind = PyUnicode_KIND(str);
9895
520k
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9896
520k
                      kind, end-start, ch, direction);
9897
520k
    if (result == -1)
9898
51.7k
        return -1;
9899
468k
    else
9900
468k
        return start + result;
9901
520k
}
9902
9903
static int
9904
tailmatch(PyObject *self,
9905
          PyObject *substring,
9906
          Py_ssize_t start,
9907
          Py_ssize_t end,
9908
          int direction)
9909
98.1M
{
9910
98.1M
    int kind_self;
9911
98.1M
    int kind_sub;
9912
98.1M
    const void *data_self;
9913
98.1M
    const void *data_sub;
9914
98.1M
    Py_ssize_t offset;
9915
98.1M
    Py_ssize_t i;
9916
98.1M
    Py_ssize_t end_sub;
9917
9918
98.1M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9919
98.1M
    end -= PyUnicode_GET_LENGTH(substring);
9920
98.1M
    if (end < start)
9921
12.8M
        return 0;
9922
9923
85.2M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9924
0
        return 1;
9925
9926
85.2M
    kind_self = PyUnicode_KIND(self);
9927
85.2M
    data_self = PyUnicode_DATA(self);
9928
85.2M
    kind_sub = PyUnicode_KIND(substring);
9929
85.2M
    data_sub = PyUnicode_DATA(substring);
9930
85.2M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9931
9932
85.2M
    if (direction > 0)
9933
7.62M
        offset = end;
9934
77.6M
    else
9935
77.6M
        offset = start;
9936
9937
85.2M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9938
85.2M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9939
85.2M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9940
43.0M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9941
        /* If both are of the same kind, memcmp is sufficient */
9942
13.7M
        if (kind_self == kind_sub) {
9943
9.06M
            return ! memcmp((char *)data_self +
9944
9.06M
                                (offset * PyUnicode_KIND(substring)),
9945
9.06M
                            data_sub,
9946
9.06M
                            PyUnicode_GET_LENGTH(substring) *
9947
9.06M
                                PyUnicode_KIND(substring));
9948
9.06M
        }
9949
        /* otherwise we have to compare each character by first accessing it */
9950
4.65M
        else {
9951
            /* We do not need to compare 0 and len(substring)-1 because
9952
               the if statement above ensured already that they are equal
9953
               when we end up here. */
9954
4.71M
            for (i = 1; i < end_sub; ++i) {
9955
57.3k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9956
57.3k
                    PyUnicode_READ(kind_sub, data_sub, i))
9957
4.57k
                    return 0;
9958
57.3k
            }
9959
4.65M
            return 1;
9960
4.65M
        }
9961
13.7M
    }
9962
9963
71.5M
    return 0;
9964
85.2M
}
9965
9966
Py_ssize_t
9967
PyUnicode_Tailmatch(PyObject *str,
9968
                    PyObject *substr,
9969
                    Py_ssize_t start,
9970
                    Py_ssize_t end,
9971
                    int direction)
9972
0
{
9973
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9974
0
        return -1;
9975
9976
0
    return tailmatch(str, substr, start, end, direction);
9977
0
}
9978
9979
static PyObject *
9980
ascii_upper_or_lower(PyObject *self, int lower)
9981
77.4M
{
9982
77.4M
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9983
77.4M
    const char *data = PyUnicode_DATA(self);
9984
77.4M
    char *resdata;
9985
77.4M
    PyObject *res;
9986
9987
77.4M
    res = PyUnicode_New(len, 127);
9988
77.4M
    if (res == NULL)
9989
0
        return NULL;
9990
77.4M
    resdata = PyUnicode_DATA(res);
9991
77.4M
    if (lower)
9992
77.4M
        _Py_bytes_lower(resdata, data, len);
9993
102
    else
9994
102
        _Py_bytes_upper(resdata, data, len);
9995
77.4M
    return res;
9996
77.4M
}
9997
9998
static Py_UCS4
9999
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
10000
38.9k
{
10001
38.9k
    Py_ssize_t j;
10002
38.9k
    int final_sigma;
10003
38.9k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
10004
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
10005
10006
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10007
10008
    where ! is a negation and \p{xxx} is a character with property xxx.
10009
    */
10010
86.7k
    for (j = i - 1; j >= 0; j--) {
10011
84.4k
        c = PyUnicode_READ(kind, data, j);
10012
84.4k
        if (!_PyUnicode_IsCaseIgnorable(c))
10013
36.6k
            break;
10014
84.4k
    }
10015
38.9k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10016
38.9k
    if (final_sigma) {
10017
64.5k
        for (j = i + 1; j < length; j++) {
10018
61.9k
            c = PyUnicode_READ(kind, data, j);
10019
61.9k
            if (!_PyUnicode_IsCaseIgnorable(c))
10020
24.1k
                break;
10021
61.9k
        }
10022
26.7k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
10023
26.7k
    }
10024
38.9k
    return (final_sigma) ? 0x3C2 : 0x3C3;
10025
38.9k
}
10026
10027
static int
10028
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
10029
           Py_UCS4 c, Py_UCS4 *mapped)
10030
104M
{
10031
    /* Obscure special case. */
10032
104M
    if (c == 0x3A3) {
10033
38.9k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
10034
38.9k
        return 1;
10035
38.9k
    }
10036
104M
    return _PyUnicode_ToLowerFull(c, mapped);
10037
104M
}
10038
10039
static Py_ssize_t
10040
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10041
0
{
10042
0
    Py_ssize_t i, k = 0;
10043
0
    int n_res, j;
10044
0
    Py_UCS4 c, mapped[3];
10045
10046
0
    c = PyUnicode_READ(kind, data, 0);
10047
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
10048
0
    for (j = 0; j < n_res; j++) {
10049
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
10050
0
        res[k++] = mapped[j];
10051
0
    }
10052
0
    for (i = 1; i < length; i++) {
10053
0
        c = PyUnicode_READ(kind, data, i);
10054
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
10055
0
        for (j = 0; j < n_res; j++) {
10056
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10057
0
            res[k++] = mapped[j];
10058
0
        }
10059
0
    }
10060
0
    return k;
10061
0
}
10062
10063
static Py_ssize_t
10064
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
10065
0
    Py_ssize_t i, k = 0;
10066
10067
0
    for (i = 0; i < length; i++) {
10068
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10069
0
        int n_res, j;
10070
0
        if (Py_UNICODE_ISUPPER(c)) {
10071
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
10072
0
        }
10073
0
        else if (Py_UNICODE_ISLOWER(c)) {
10074
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
10075
0
        }
10076
0
        else {
10077
0
            n_res = 1;
10078
0
            mapped[0] = c;
10079
0
        }
10080
0
        for (j = 0; j < n_res; j++) {
10081
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10082
0
            res[k++] = mapped[j];
10083
0
        }
10084
0
    }
10085
0
    return k;
10086
0
}
10087
10088
static Py_ssize_t
10089
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
10090
                  Py_UCS4 *maxchar, int lower)
10091
32.8M
{
10092
32.8M
    Py_ssize_t i, k = 0;
10093
10094
137M
    for (i = 0; i < length; i++) {
10095
104M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10096
104M
        int n_res, j;
10097
104M
        if (lower)
10098
104M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
10099
0
        else
10100
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
10101
209M
        for (j = 0; j < n_res; j++) {
10102
104M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10103
104M
            res[k++] = mapped[j];
10104
104M
        }
10105
104M
    }
10106
32.8M
    return k;
10107
32.8M
}
10108
10109
static Py_ssize_t
10110
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10111
0
{
10112
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10113
0
}
10114
10115
static Py_ssize_t
10116
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10117
32.8M
{
10118
32.8M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10119
32.8M
}
10120
10121
static Py_ssize_t
10122
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10123
0
{
10124
0
    Py_ssize_t i, k = 0;
10125
10126
0
    for (i = 0; i < length; i++) {
10127
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
10128
0
        Py_UCS4 mapped[3];
10129
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10130
0
        for (j = 0; j < n_res; j++) {
10131
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10132
0
            res[k++] = mapped[j];
10133
0
        }
10134
0
    }
10135
0
    return k;
10136
0
}
10137
10138
static Py_ssize_t
10139
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10140
0
{
10141
0
    Py_ssize_t i, k = 0;
10142
0
    int previous_is_cased;
10143
10144
0
    previous_is_cased = 0;
10145
0
    for (i = 0; i < length; i++) {
10146
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10147
0
        Py_UCS4 mapped[3];
10148
0
        int n_res, j;
10149
10150
0
        if (previous_is_cased)
10151
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
10152
0
        else
10153
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
10154
10155
0
        for (j = 0; j < n_res; j++) {
10156
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10157
0
            res[k++] = mapped[j];
10158
0
        }
10159
10160
0
        previous_is_cased = _PyUnicode_IsCased(c);
10161
0
    }
10162
0
    return k;
10163
0
}
10164
10165
static PyObject *
10166
case_operation(PyObject *self,
10167
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10168
32.8M
{
10169
32.8M
    PyObject *res = NULL;
10170
32.8M
    Py_ssize_t length, newlength = 0;
10171
32.8M
    int kind, outkind;
10172
32.8M
    const void *data;
10173
32.8M
    void *outdata;
10174
32.8M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
10175
10176
32.8M
    kind = PyUnicode_KIND(self);
10177
32.8M
    data = PyUnicode_DATA(self);
10178
32.8M
    length = PyUnicode_GET_LENGTH(self);
10179
32.8M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10180
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
10181
0
        return NULL;
10182
0
    }
10183
32.8M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10184
32.8M
    if (tmp == NULL)
10185
0
        return PyErr_NoMemory();
10186
32.8M
    newlength = perform(kind, data, length, tmp, &maxchar);
10187
32.8M
    res = PyUnicode_New(newlength, maxchar);
10188
32.8M
    if (res == NULL)
10189
0
        goto leave;
10190
32.8M
    tmpend = tmp + newlength;
10191
32.8M
    outdata = PyUnicode_DATA(res);
10192
32.8M
    outkind = PyUnicode_KIND(res);
10193
32.8M
    switch (outkind) {
10194
240k
    case PyUnicode_1BYTE_KIND:
10195
240k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10196
240k
        break;
10197
32.5M
    case PyUnicode_2BYTE_KIND:
10198
32.5M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10199
32.5M
        break;
10200
48.6k
    case PyUnicode_4BYTE_KIND:
10201
48.6k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10202
48.6k
        break;
10203
0
    default:
10204
0
        Py_UNREACHABLE();
10205
32.8M
    }
10206
32.8M
  leave:
10207
32.8M
    PyMem_Free(tmp);
10208
32.8M
    return res;
10209
32.8M
}
10210
10211
PyObject *
10212
PyUnicode_Join(PyObject *separator, PyObject *seq)
10213
20.7M
{
10214
20.7M
    PyObject *res;
10215
20.7M
    PyObject *fseq;
10216
20.7M
    Py_ssize_t seqlen;
10217
20.7M
    PyObject **items;
10218
10219
20.7M
    fseq = PySequence_Fast(seq, "can only join an iterable");
10220
20.7M
    if (fseq == NULL) {
10221
613
        return NULL;
10222
613
    }
10223
10224
20.7M
    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);
10225
10226
20.7M
    items = PySequence_Fast_ITEMS(fseq);
10227
20.7M
    seqlen = PySequence_Fast_GET_SIZE(fseq);
10228
20.7M
    res = _PyUnicode_JoinArray(separator, items, seqlen);
10229
10230
20.7M
    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();
10231
10232
20.7M
    Py_DECREF(fseq);
10233
20.7M
    return res;
10234
20.7M
}
10235
10236
PyObject *
10237
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10238
48.9M
{
10239
48.9M
    PyObject *res = NULL; /* the result */
10240
48.9M
    PyObject *sep = NULL;
10241
48.9M
    Py_ssize_t seplen;
10242
48.9M
    PyObject *item;
10243
48.9M
    Py_ssize_t sz, i, res_offset;
10244
48.9M
    Py_UCS4 maxchar;
10245
48.9M
    Py_UCS4 item_maxchar;
10246
48.9M
    int use_memcpy;
10247
48.9M
    unsigned char *res_data = NULL, *sep_data = NULL;
10248
48.9M
    PyObject *last_obj;
10249
48.9M
    int kind = 0;
10250
10251
    /* If empty sequence, return u"". */
10252
48.9M
    if (seqlen == 0) {
10253
4.96M
        _Py_RETURN_UNICODE_EMPTY();
10254
4.96M
    }
10255
10256
    /* If singleton sequence with an exact Unicode, return that. */
10257
43.9M
    last_obj = NULL;
10258
43.9M
    if (seqlen == 1) {
10259
6.51M
        if (PyUnicode_CheckExact(items[0])) {
10260
5.02M
            res = items[0];
10261
5.02M
            return Py_NewRef(res);
10262
5.02M
        }
10263
1.48M
        seplen = 0;
10264
1.48M
        maxchar = 0;
10265
1.48M
    }
10266
37.4M
    else {
10267
        /* Set up sep and seplen */
10268
37.4M
        if (separator == NULL) {
10269
            /* fall back to a blank space separator */
10270
0
            sep = PyUnicode_FromOrdinal(' ');
10271
0
            if (!sep)
10272
0
                goto onError;
10273
0
            seplen = 1;
10274
0
            maxchar = 32;
10275
0
        }
10276
37.4M
        else {
10277
37.4M
            if (!PyUnicode_Check(separator)) {
10278
0
                PyErr_Format(PyExc_TypeError,
10279
0
                             "separator: expected str instance,"
10280
0
                             " %.80s found",
10281
0
                             Py_TYPE(separator)->tp_name);
10282
0
                goto onError;
10283
0
            }
10284
37.4M
            sep = separator;
10285
37.4M
            seplen = PyUnicode_GET_LENGTH(separator);
10286
37.4M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10287
            /* inc refcount to keep this code path symmetric with the
10288
               above case of a blank separator */
10289
37.4M
            Py_INCREF(sep);
10290
37.4M
        }
10291
37.4M
        last_obj = sep;
10292
37.4M
    }
10293
10294
    /* There are at least two things to join, or else we have a subclass
10295
     * of str in the sequence.
10296
     * Do a pre-pass to figure out the total amount of space we'll
10297
     * need (sz), and see whether all argument are strings.
10298
     */
10299
38.9M
    sz = 0;
10300
#ifdef Py_DEBUG
10301
    use_memcpy = 0;
10302
#else
10303
38.9M
    use_memcpy = 1;
10304
38.9M
#endif
10305
354M
    for (i = 0; i < seqlen; i++) {
10306
315M
        size_t add_sz;
10307
315M
        item = items[i];
10308
315M
        if (!PyUnicode_Check(item)) {
10309
0
            PyErr_Format(PyExc_TypeError,
10310
0
                         "sequence item %zd: expected str instance,"
10311
0
                         " %.80s found",
10312
0
                         i, Py_TYPE(item)->tp_name);
10313
0
            goto onError;
10314
0
        }
10315
315M
        add_sz = PyUnicode_GET_LENGTH(item);
10316
315M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10317
315M
        maxchar = Py_MAX(maxchar, item_maxchar);
10318
315M
        if (i != 0) {
10319
276M
            add_sz += seplen;
10320
276M
        }
10321
315M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10322
0
            PyErr_SetString(PyExc_OverflowError,
10323
0
                            "join() result is too long for a Python string");
10324
0
            goto onError;
10325
0
        }
10326
315M
        sz += add_sz;
10327
315M
        if (use_memcpy && last_obj != NULL) {
10328
253M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10329
5.22M
                use_memcpy = 0;
10330
253M
        }
10331
315M
        last_obj = item;
10332
315M
    }
10333
10334
38.9M
    res = PyUnicode_New(sz, maxchar);
10335
38.9M
    if (res == NULL)
10336
0
        goto onError;
10337
10338
    /* Catenate everything. */
10339
#ifdef Py_DEBUG
10340
    use_memcpy = 0;
10341
#else
10342
38.9M
    if (use_memcpy) {
10343
33.7M
        res_data = PyUnicode_1BYTE_DATA(res);
10344
33.7M
        kind = PyUnicode_KIND(res);
10345
33.7M
        if (seplen != 0)
10346
15.5k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10347
33.7M
    }
10348
38.9M
#endif
10349
38.9M
    if (use_memcpy) {
10350
255M
        for (i = 0; i < seqlen; ++i) {
10351
222M
            Py_ssize_t itemlen;
10352
222M
            item = items[i];
10353
10354
            /* Copy item, and maybe the separator. */
10355
222M
            if (i && seplen != 0) {
10356
21.8k
                memcpy(res_data,
10357
21.8k
                          sep_data,
10358
21.8k
                          kind * seplen);
10359
21.8k
                res_data += kind * seplen;
10360
21.8k
            }
10361
10362
222M
            itemlen = PyUnicode_GET_LENGTH(item);
10363
222M
            if (itemlen != 0) {
10364
187M
                memcpy(res_data,
10365
187M
                          PyUnicode_DATA(item),
10366
187M
                          kind * itemlen);
10367
187M
                res_data += kind * itemlen;
10368
187M
            }
10369
222M
        }
10370
33.7M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10371
33.7M
                           + kind * PyUnicode_GET_LENGTH(res));
10372
33.7M
    }
10373
5.22M
    else {
10374
98.2M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10375
93.0M
            Py_ssize_t itemlen;
10376
93.0M
            item = items[i];
10377
10378
            /* Copy item, and maybe the separator. */
10379
93.0M
            if (i && seplen != 0) {
10380
62.0k
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10381
62.0k
                res_offset += seplen;
10382
62.0k
            }
10383
10384
93.0M
            itemlen = PyUnicode_GET_LENGTH(item);
10385
93.0M
            if (itemlen != 0) {
10386
90.7M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10387
90.7M
                res_offset += itemlen;
10388
90.7M
            }
10389
93.0M
        }
10390
5.22M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10391
5.22M
    }
10392
10393
38.9M
    Py_XDECREF(sep);
10394
38.9M
    assert(_PyUnicode_CheckConsistency(res, 1));
10395
38.9M
    return res;
10396
10397
0
  onError:
10398
0
    Py_XDECREF(sep);
10399
0
    Py_XDECREF(res);
10400
0
    return NULL;
10401
38.9M
}
10402
10403
void
10404
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10405
                    Py_UCS4 fill_char)
10406
738
{
10407
738
    const int kind = PyUnicode_KIND(unicode);
10408
738
    void *data = PyUnicode_DATA(unicode);
10409
738
    assert(unicode_modifiable(unicode));
10410
738
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10411
738
    assert(start >= 0);
10412
738
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10413
738
    unicode_fill(kind, data, fill_char, start, length);
10414
738
}
10415
10416
Py_ssize_t
10417
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10418
               Py_UCS4 fill_char)
10419
738
{
10420
738
    Py_ssize_t maxlen;
10421
10422
738
    if (!PyUnicode_Check(unicode)) {
10423
0
        PyErr_BadInternalCall();
10424
0
        return -1;
10425
0
    }
10426
738
    if (unicode_check_modifiable(unicode))
10427
0
        return -1;
10428
10429
738
    if (start < 0) {
10430
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10431
0
        return -1;
10432
0
    }
10433
738
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10434
0
        PyErr_SetString(PyExc_ValueError,
10435
0
                         "fill character is bigger than "
10436
0
                         "the string maximum character");
10437
0
        return -1;
10438
0
    }
10439
10440
738
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10441
738
    length = Py_MIN(maxlen, length);
10442
738
    if (length <= 0)
10443
0
        return 0;
10444
10445
738
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10446
738
    return length;
10447
738
}
10448
10449
static PyObject *
10450
pad(PyObject *self,
10451
    Py_ssize_t left,
10452
    Py_ssize_t right,
10453
    Py_UCS4 fill)
10454
0
{
10455
0
    PyObject *u;
10456
0
    Py_UCS4 maxchar;
10457
0
    int kind;
10458
0
    void *data;
10459
10460
0
    if (left < 0)
10461
0
        left = 0;
10462
0
    if (right < 0)
10463
0
        right = 0;
10464
10465
0
    if (left == 0 && right == 0)
10466
0
        return unicode_result_unchanged(self);
10467
10468
0
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10469
0
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10470
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10471
0
        return NULL;
10472
0
    }
10473
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10474
0
    maxchar = Py_MAX(maxchar, fill);
10475
0
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10476
0
    if (!u)
10477
0
        return NULL;
10478
10479
0
    kind = PyUnicode_KIND(u);
10480
0
    data = PyUnicode_DATA(u);
10481
0
    if (left)
10482
0
        unicode_fill(kind, data, fill, 0, left);
10483
0
    if (right)
10484
0
        unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10485
0
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10486
0
    assert(_PyUnicode_CheckConsistency(u, 1));
10487
0
    return u;
10488
0
}
10489
10490
/* Split `string` into lines by dispatching to the stringlib splitlines
   helper that matches the string's character kind (with a dedicated helper
   for ASCII-only 1-byte strings).  `keepends` is forwarded unchanged to the
   helper.  Returns NULL if ensure_unicode() rejects `string`; otherwise
   returns whatever the helper returns. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(string)) {
            return asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string),
                PyUnicode_GET_LENGTH(string), keepends);
        }
        return ucs1lib_splitlines(
            string, PyUnicode_1BYTE_DATA(string),
            PyUnicode_GET_LENGTH(string), keepends);

    case PyUnicode_2BYTE_KIND:
        return ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string),
            PyUnicode_GET_LENGTH(string), keepends);

    case PyUnicode_4BYTE_KIND:
        return ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string),
            PyUnicode_GET_LENGTH(string), keepends);
    }
    /* Every valid kind returned above. */
    Py_UNREACHABLE();
}
10524
10525
static PyObject *
10526
split(PyObject *self,
10527
      PyObject *substring,
10528
      Py_ssize_t maxcount)
10529
22.3M
{
10530
22.3M
    int kind1, kind2;
10531
22.3M
    const void *buf1, *buf2;
10532
22.3M
    Py_ssize_t len1, len2;
10533
22.3M
    PyObject* out;
10534
22.3M
    len1 = PyUnicode_GET_LENGTH(self);
10535
22.3M
    kind1 = PyUnicode_KIND(self);
10536
10537
22.3M
    if (substring == NULL) {
10538
142k
        if (maxcount < 0) {
10539
118k
            maxcount = (len1 - 1) / 2 + 1;
10540
118k
        }
10541
142k
        switch (kind1) {
10542
92.1k
        case PyUnicode_1BYTE_KIND:
10543
92.1k
            if (PyUnicode_IS_ASCII(self))
10544
67.8k
                return asciilib_split_whitespace(
10545
67.8k
                    self,  PyUnicode_1BYTE_DATA(self),
10546
67.8k
                    len1, maxcount
10547
67.8k
                    );
10548
24.2k
            else
10549
24.2k
                return ucs1lib_split_whitespace(
10550
24.2k
                    self,  PyUnicode_1BYTE_DATA(self),
10551
24.2k
                    len1, maxcount
10552
24.2k
                    );
10553
40.2k
        case PyUnicode_2BYTE_KIND:
10554
40.2k
            return ucs2lib_split_whitespace(
10555
40.2k
                self,  PyUnicode_2BYTE_DATA(self),
10556
40.2k
                len1, maxcount
10557
40.2k
                );
10558
10.0k
        case PyUnicode_4BYTE_KIND:
10559
10.0k
            return ucs4lib_split_whitespace(
10560
10.0k
                self,  PyUnicode_4BYTE_DATA(self),
10561
10.0k
                len1, maxcount
10562
10.0k
                );
10563
0
        default:
10564
0
            Py_UNREACHABLE();
10565
142k
        }
10566
142k
    }
10567
10568
22.2M
    kind2 = PyUnicode_KIND(substring);
10569
22.2M
    len2 = PyUnicode_GET_LENGTH(substring);
10570
22.2M
    if (maxcount < 0) {
10571
        // if len2 == 0, it will raise ValueError.
10572
11.0M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10573
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10574
11.0M
        maxcount = maxcount < 0 ? len1 : maxcount;
10575
11.0M
    }
10576
22.2M
    if (kind1 < kind2 || len1 < len2) {
10577
6.02M
        out = PyList_New(1);
10578
6.02M
        if (out == NULL)
10579
0
            return NULL;
10580
6.02M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10581
6.02M
        return out;
10582
6.02M
    }
10583
16.1M
    buf1 = PyUnicode_DATA(self);
10584
16.1M
    buf2 = PyUnicode_DATA(substring);
10585
16.1M
    if (kind2 != kind1) {
10586
188k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10587
188k
        if (!buf2)
10588
0
            return NULL;
10589
188k
    }
10590
10591
16.1M
    switch (kind1) {
10592
15.9M
    case PyUnicode_1BYTE_KIND:
10593
15.9M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10594
14.9M
            out = asciilib_split(
10595
14.9M
                self,  buf1, len1, buf2, len2, maxcount);
10596
1.06M
        else
10597
1.06M
            out = ucs1lib_split(
10598
1.06M
                self,  buf1, len1, buf2, len2, maxcount);
10599
15.9M
        break;
10600
156k
    case PyUnicode_2BYTE_KIND:
10601
156k
        out = ucs2lib_split(
10602
156k
            self,  buf1, len1, buf2, len2, maxcount);
10603
156k
        break;
10604
31.4k
    case PyUnicode_4BYTE_KIND:
10605
31.4k
        out = ucs4lib_split(
10606
31.4k
            self,  buf1, len1, buf2, len2, maxcount);
10607
31.4k
        break;
10608
0
    default:
10609
0
        out = NULL;
10610
16.1M
    }
10611
16.1M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10612
16.1M
    if (kind2 != kind1)
10613
188k
        PyMem_Free((void *)buf2);
10614
16.1M
    return out;
10615
16.1M
}
10616
10617
static PyObject *
10618
rsplit(PyObject *self,
10619
       PyObject *substring,
10620
       Py_ssize_t maxcount)
10621
50
{
10622
50
    int kind1, kind2;
10623
50
    const void *buf1, *buf2;
10624
50
    Py_ssize_t len1, len2;
10625
50
    PyObject* out;
10626
10627
50
    len1 = PyUnicode_GET_LENGTH(self);
10628
50
    kind1 = PyUnicode_KIND(self);
10629
10630
50
    if (substring == NULL) {
10631
0
        if (maxcount < 0) {
10632
0
            maxcount = (len1 - 1) / 2 + 1;
10633
0
        }
10634
0
        switch (kind1) {
10635
0
        case PyUnicode_1BYTE_KIND:
10636
0
            if (PyUnicode_IS_ASCII(self))
10637
0
                return asciilib_rsplit_whitespace(
10638
0
                    self,  PyUnicode_1BYTE_DATA(self),
10639
0
                    len1, maxcount
10640
0
                    );
10641
0
            else
10642
0
                return ucs1lib_rsplit_whitespace(
10643
0
                    self,  PyUnicode_1BYTE_DATA(self),
10644
0
                    len1, maxcount
10645
0
                    );
10646
0
        case PyUnicode_2BYTE_KIND:
10647
0
            return ucs2lib_rsplit_whitespace(
10648
0
                self,  PyUnicode_2BYTE_DATA(self),
10649
0
                len1, maxcount
10650
0
                );
10651
0
        case PyUnicode_4BYTE_KIND:
10652
0
            return ucs4lib_rsplit_whitespace(
10653
0
                self,  PyUnicode_4BYTE_DATA(self),
10654
0
                len1, maxcount
10655
0
                );
10656
0
        default:
10657
0
            Py_UNREACHABLE();
10658
0
        }
10659
0
    }
10660
50
    kind2 = PyUnicode_KIND(substring);
10661
50
    len2 = PyUnicode_GET_LENGTH(substring);
10662
50
    if (maxcount < 0) {
10663
        // if len2 == 0, it will raise ValueError.
10664
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10665
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10666
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10667
0
    }
10668
50
    if (kind1 < kind2 || len1 < len2) {
10669
0
        out = PyList_New(1);
10670
0
        if (out == NULL)
10671
0
            return NULL;
10672
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10673
0
        return out;
10674
0
    }
10675
50
    buf1 = PyUnicode_DATA(self);
10676
50
    buf2 = PyUnicode_DATA(substring);
10677
50
    if (kind2 != kind1) {
10678
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10679
0
        if (!buf2)
10680
0
            return NULL;
10681
0
    }
10682
10683
50
    switch (kind1) {
10684
50
    case PyUnicode_1BYTE_KIND:
10685
50
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10686
50
            out = asciilib_rsplit(
10687
50
                self,  buf1, len1, buf2, len2, maxcount);
10688
0
        else
10689
0
            out = ucs1lib_rsplit(
10690
0
                self,  buf1, len1, buf2, len2, maxcount);
10691
50
        break;
10692
0
    case PyUnicode_2BYTE_KIND:
10693
0
        out = ucs2lib_rsplit(
10694
0
            self,  buf1, len1, buf2, len2, maxcount);
10695
0
        break;
10696
0
    case PyUnicode_4BYTE_KIND:
10697
0
        out = ucs4lib_rsplit(
10698
0
            self,  buf1, len1, buf2, len2, maxcount);
10699
0
        break;
10700
0
    default:
10701
0
        out = NULL;
10702
50
    }
10703
50
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10704
50
    if (kind2 != kind1)
10705
0
        PyMem_Free((void *)buf2);
10706
50
    return out;
10707
50
}
10708
10709
static Py_ssize_t
10710
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10711
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10712
146M
{
10713
146M
    switch (kind) {
10714
21.5M
    case PyUnicode_1BYTE_KIND:
10715
21.5M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10716
17.9M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10717
3.61M
        else
10718
3.61M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10719
60.6M
    case PyUnicode_2BYTE_KIND:
10720
60.6M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10721
63.7M
    case PyUnicode_4BYTE_KIND:
10722
63.7M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10723
146M
    }
10724
146M
    Py_UNREACHABLE();
10725
146M
}
10726
10727
static Py_ssize_t
10728
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10729
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10730
41.0M
{
10731
41.0M
    switch (kind) {
10732
35.5M
    case PyUnicode_1BYTE_KIND:
10733
35.5M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10734
5.38M
    case PyUnicode_2BYTE_KIND:
10735
5.38M
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10736
135k
    case PyUnicode_4BYTE_KIND:
10737
135k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10738
41.0M
    }
10739
41.0M
    Py_UNREACHABLE();
10740
41.0M
}
10741
10742
static void
10743
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10744
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10745
1.43M
{
10746
1.43M
    int kind = PyUnicode_KIND(u);
10747
1.43M
    void *data = PyUnicode_DATA(u);
10748
1.43M
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10749
1.43M
    if (kind == PyUnicode_1BYTE_KIND) {
10750
539k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10751
539k
                                      (Py_UCS1 *)data + len,
10752
539k
                                      u1, u2, maxcount);
10753
539k
    }
10754
892k
    else if (kind == PyUnicode_2BYTE_KIND) {
10755
877k
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10756
877k
                                      (Py_UCS2 *)data + len,
10757
877k
                                      u1, u2, maxcount);
10758
877k
    }
10759
15.4k
    else {
10760
15.4k
        assert(kind == PyUnicode_4BYTE_KIND);
10761
15.4k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10762
15.4k
                                      (Py_UCS4 *)data + len,
10763
15.4k
                                      u1, u2, maxcount);
10764
15.4k
    }
10765
1.43M
}
10766
10767
static PyObject *
10768
replace(PyObject *self, PyObject *str1,
10769
        PyObject *str2, Py_ssize_t maxcount)
10770
78.7M
{
10771
78.7M
    PyObject *u;
10772
78.7M
    const char *sbuf = PyUnicode_DATA(self);
10773
78.7M
    const void *buf1 = PyUnicode_DATA(str1);
10774
78.7M
    const void *buf2 = PyUnicode_DATA(str2);
10775
78.7M
    int srelease = 0, release1 = 0, release2 = 0;
10776
78.7M
    int skind = PyUnicode_KIND(self);
10777
78.7M
    int kind1 = PyUnicode_KIND(str1);
10778
78.7M
    int kind2 = PyUnicode_KIND(str2);
10779
78.7M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10780
78.7M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10781
78.7M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10782
78.7M
    int mayshrink;
10783
78.7M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10784
10785
78.7M
    if (slen < len1)
10786
31.4M
        goto nothing;
10787
10788
47.2M
    if (maxcount < 0)
10789
47.2M
        maxcount = PY_SSIZE_T_MAX;
10790
0
    else if (maxcount == 0)
10791
0
        goto nothing;
10792
10793
47.2M
    if (str1 == str2)
10794
0
        goto nothing;
10795
10796
47.2M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10797
47.2M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10798
47.2M
    if (maxchar < maxchar_str1)
10799
        /* substring too wide to be present */
10800
0
        goto nothing;
10801
47.2M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10802
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10803
       result string. */
10804
47.2M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10805
47.2M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10806
10807
47.2M
    if (len1 == len2) {
10808
        /* same length */
10809
6.23M
        if (len1 == 0)
10810
0
            goto nothing;
10811
6.23M
        if (len1 == 1) {
10812
            /* replace characters */
10813
6.23M
            Py_UCS4 u1, u2;
10814
6.23M
            Py_ssize_t pos;
10815
10816
6.23M
            u1 = PyUnicode_READ(kind1, buf1, 0);
10817
6.23M
            pos = findchar(sbuf, skind, slen, u1, 1);
10818
6.23M
            if (pos < 0)
10819
4.80M
                goto nothing;
10820
1.43M
            u2 = PyUnicode_READ(kind2, buf2, 0);
10821
1.43M
            u = PyUnicode_New(slen, maxchar);
10822
1.43M
            if (!u)
10823
0
                goto error;
10824
10825
1.43M
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10826
1.43M
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10827
1.43M
        }
10828
0
        else {
10829
0
            int rkind = skind;
10830
0
            char *res;
10831
0
            Py_ssize_t i;
10832
10833
0
            if (kind1 < rkind) {
10834
                /* widen substring */
10835
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10836
0
                if (!buf1) goto error;
10837
0
                release1 = 1;
10838
0
            }
10839
0
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10840
0
            if (i < 0)
10841
0
                goto nothing;
10842
0
            if (rkind > kind2) {
10843
                /* widen replacement */
10844
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10845
0
                if (!buf2) goto error;
10846
0
                release2 = 1;
10847
0
            }
10848
0
            else if (rkind < kind2) {
10849
                /* widen self and buf1 */
10850
0
                rkind = kind2;
10851
0
                if (release1) {
10852
0
                    assert(buf1 != PyUnicode_DATA(str1));
10853
0
                    PyMem_Free((void *)buf1);
10854
0
                    buf1 = PyUnicode_DATA(str1);
10855
0
                    release1 = 0;
10856
0
                }
10857
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10858
0
                if (!sbuf) goto error;
10859
0
                srelease = 1;
10860
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10861
0
                if (!buf1) goto error;
10862
0
                release1 = 1;
10863
0
            }
10864
0
            u = PyUnicode_New(slen, maxchar);
10865
0
            if (!u)
10866
0
                goto error;
10867
0
            assert(PyUnicode_KIND(u) == rkind);
10868
0
            res = PyUnicode_DATA(u);
10869
10870
0
            memcpy(res, sbuf, rkind * slen);
10871
            /* change everything in-place, starting with this one */
10872
0
            memcpy(res + rkind * i,
10873
0
                   buf2,
10874
0
                   rkind * len2);
10875
0
            i += len1;
10876
10877
0
            while ( --maxcount > 0) {
10878
0
                i = anylib_find(rkind, self,
10879
0
                                sbuf+rkind*i, slen-i,
10880
0
                                str1, buf1, len1, i);
10881
0
                if (i == -1)
10882
0
                    break;
10883
0
                memcpy(res + rkind * i,
10884
0
                       buf2,
10885
0
                       rkind * len2);
10886
0
                i += len1;
10887
0
            }
10888
0
        }
10889
6.23M
    }
10890
41.0M
    else {
10891
41.0M
        Py_ssize_t n, i, j, ires;
10892
41.0M
        Py_ssize_t new_size;
10893
41.0M
        int rkind = skind;
10894
41.0M
        char *res;
10895
10896
41.0M
        if (kind1 < rkind) {
10897
            /* widen substring */
10898
5.51M
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10899
5.51M
            if (!buf1) goto error;
10900
5.51M
            release1 = 1;
10901
5.51M
        }
10902
41.0M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10903
41.0M
        if (n == 0)
10904
36.0M
            goto nothing;
10905
4.94M
        if (kind2 < rkind) {
10906
            /* widen replacement */
10907
1.12M
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10908
1.12M
            if (!buf2) goto error;
10909
1.12M
            release2 = 1;
10910
1.12M
        }
10911
3.81M
        else if (kind2 > rkind) {
10912
            /* widen self and buf1 */
10913
0
            rkind = kind2;
10914
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10915
0
            if (!sbuf) goto error;
10916
0
            srelease = 1;
10917
0
            if (release1) {
10918
0
                assert(buf1 != PyUnicode_DATA(str1));
10919
0
                PyMem_Free((void *)buf1);
10920
0
                buf1 = PyUnicode_DATA(str1);
10921
0
                release1 = 0;
10922
0
            }
10923
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10924
0
            if (!buf1) goto error;
10925
0
            release1 = 1;
10926
0
        }
10927
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10928
           PyUnicode_GET_LENGTH(str1)); */
10929
4.94M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10930
0
                PyErr_SetString(PyExc_OverflowError,
10931
0
                                "replace string is too long");
10932
0
                goto error;
10933
0
        }
10934
4.94M
        new_size = slen + n * (len2 - len1);
10935
4.94M
        if (new_size == 0) {
10936
0
            u = unicode_get_empty();
10937
0
            goto done;
10938
0
        }
10939
4.94M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10940
0
            PyErr_SetString(PyExc_OverflowError,
10941
0
                            "replace string is too long");
10942
0
            goto error;
10943
0
        }
10944
4.94M
        u = PyUnicode_New(new_size, maxchar);
10945
4.94M
        if (!u)
10946
0
            goto error;
10947
4.94M
        assert(PyUnicode_KIND(u) == rkind);
10948
4.94M
        res = PyUnicode_DATA(u);
10949
4.94M
        ires = i = 0;
10950
4.94M
        if (len1 > 0) {
10951
150M
            while (n-- > 0) {
10952
                /* look for next match */
10953
146M
                j = anylib_find(rkind, self,
10954
146M
                                sbuf + rkind * i, slen-i,
10955
146M
                                str1, buf1, len1, i);
10956
146M
                if (j == -1)
10957
0
                    break;
10958
146M
                else if (j > i) {
10959
                    /* copy unchanged part [i:j] */
10960
20.9M
                    memcpy(res + rkind * ires,
10961
20.9M
                           sbuf + rkind * i,
10962
20.9M
                           rkind * (j-i));
10963
20.9M
                    ires += j - i;
10964
20.9M
                }
10965
                /* copy substitution string */
10966
146M
                if (len2 > 0) {
10967
146M
                    memcpy(res + rkind * ires,
10968
146M
                           buf2,
10969
146M
                           rkind * len2);
10970
146M
                    ires += len2;
10971
146M
                }
10972
146M
                i = j + len1;
10973
146M
            }
10974
4.94M
            if (i < slen)
10975
                /* copy tail [i:] */
10976
4.85M
                memcpy(res + rkind * ires,
10977
4.85M
                       sbuf + rkind * i,
10978
4.85M
                       rkind * (slen-i));
10979
4.94M
        }
10980
0
        else {
10981
            /* interleave */
10982
0
            while (n > 0) {
10983
0
                memcpy(res + rkind * ires,
10984
0
                       buf2,
10985
0
                       rkind * len2);
10986
0
                ires += len2;
10987
0
                if (--n <= 0)
10988
0
                    break;
10989
0
                memcpy(res + rkind * ires,
10990
0
                       sbuf + rkind * i,
10991
0
                       rkind);
10992
0
                ires++;
10993
0
                i++;
10994
0
            }
10995
0
            memcpy(res + rkind * ires,
10996
0
                   sbuf + rkind * i,
10997
0
                   rkind * (slen-i));
10998
0
        }
10999
4.94M
    }
11000
11001
6.37M
    if (mayshrink) {
11002
0
        unicode_adjust_maxchar(&u);
11003
0
        if (u == NULL)
11004
0
            goto error;
11005
0
    }
11006
11007
6.37M
  done:
11008
6.37M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
11009
6.37M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11010
6.37M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11011
6.37M
    if (srelease)
11012
0
        PyMem_Free((void *)sbuf);
11013
6.37M
    if (release1)
11014
1.12M
        PyMem_Free((void *)buf1);
11015
6.37M
    if (release2)
11016
1.12M
        PyMem_Free((void *)buf2);
11017
6.37M
    assert(_PyUnicode_CheckConsistency(u, 1));
11018
6.37M
    return u;
11019
11020
72.3M
  nothing:
11021
    /* nothing to replace; return original string (when possible) */
11022
72.3M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
11023
72.3M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11024
72.3M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11025
72.3M
    if (srelease)
11026
0
        PyMem_Free((void *)sbuf);
11027
72.3M
    if (release1)
11028
4.38M
        PyMem_Free((void *)buf1);
11029
72.3M
    if (release2)
11030
0
        PyMem_Free((void *)buf2);
11031
72.3M
    return unicode_result_unchanged(self);
11032
11033
0
  error:
11034
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
11035
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11036
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11037
0
    if (srelease)
11038
0
        PyMem_Free((void *)sbuf);
11039
0
    if (release1)
11040
0
        PyMem_Free((void *)buf1);
11041
0
    if (release2)
11042
0
        PyMem_Free((void *)buf2);
11043
0
    return NULL;
11044
6.37M
}
11045
11046
/* --- Unicode Object Methods --------------------------------------------- */
11047
11048
/*[clinic input]
11049
@permit_long_docstring_body
11050
str.title as unicode_title
11051
11052
Return a version of the string where each word is titlecased.
11053
11054
More specifically, words start with uppercased characters and all remaining
11055
cased characters have lower case.
11056
[clinic start generated code]*/
11057
11058
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* All of the work is done by case_operation() with the do_title
       callback. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
11064
11065
/*[clinic input]
11066
@permit_long_docstring_body
11067
str.capitalize as unicode_capitalize
11068
11069
Return a capitalized version of the string.
11070
11071
More specifically, make the first character have upper case and the rest lower
11072
case.
11073
[clinic start generated code]*/
11074
11075
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    /* An empty string has nothing to capitalize. */
    return unicode_result_unchanged(self);
}
11083
11084
/*[clinic input]
11085
str.casefold as unicode_casefold
11086
11087
Return a version of the string suitable for caseless comparisons.
11088
[clinic start generated code]*/
11089
11090
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* ASCII-only strings are handled by ascii_upper_or_lower(); everything
       else goes through case_operation() with the do_casefold callback. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_casefold);
    }
    return ascii_upper_or_lower(self, 1);
}
11098
11099
11100
/* Argument converter. Accepts a single Unicode character. */
11101
11102
static int
11103
convert_uc(PyObject *obj, void *addr)
11104
0
{
11105
0
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
11106
11107
0
    if (!PyUnicode_Check(obj)) {
11108
0
        PyErr_Format(PyExc_TypeError,
11109
0
                     "The fill character must be a unicode character, "
11110
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
11111
0
        return 0;
11112
0
    }
11113
0
    if (PyUnicode_GET_LENGTH(obj) != 1) {
11114
0
        PyErr_SetString(PyExc_TypeError,
11115
0
                        "The fill character must be exactly one character long");
11116
0
        return 0;
11117
0
    }
11118
0
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
11119
0
    return 1;
11120
0
}
11121
11122
/*[clinic input]
11123
str.center as unicode_center
11124
11125
    width: Py_ssize_t
11126
    fillchar: Py_UCS4 = ' '
11127
    /
11128
11129
Return a centered string of length width.
11130
11131
Padding is done using the specified fill character (default is a space).
11132
[clinic start generated code]*/
11133
11134
static PyObject *
11135
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11136
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
11137
0
{
11138
0
    Py_ssize_t marg, left;
11139
11140
0
    if (PyUnicode_GET_LENGTH(self) >= width)
11141
0
        return unicode_result_unchanged(self);
11142
11143
0
    marg = width - PyUnicode_GET_LENGTH(self);
11144
0
    left = marg / 2 + (marg & width & 1);
11145
11146
0
    return pad(self, left, marg - left, fillchar);
11147
0
}
11148
11149
/* This function assumes that str1 and str2 are readied by the caller. */
11150
11151
static int
11152
unicode_compare(PyObject *str1, PyObject *str2)
11153
19.3M
{
11154
19.3M
#define COMPARE(TYPE1, TYPE2) \
11155
19.3M
    do { \
11156
18.3M
        TYPE1* p1 = (TYPE1 *)data1; \
11157
18.3M
        TYPE2* p2 = (TYPE2 *)data2; \
11158
18.3M
        TYPE1* end = p1 + len; \
11159
18.3M
        Py_UCS4 c1, c2; \
11160
18.3M
        for (; p1 != end; p1++, p2++) { \
11161
18.3M
            c1 = *p1; \
11162
18.3M
            c2 = *p2; \
11163
18.3M
            if (c1 != c2) \
11164
18.3M
                return (c1 < c2) ? -1 : 1; \
11165
18.3M
        } \
11166
18.3M
    } \
11167
18.3M
    while (0)
11168
11169
19.3M
    int kind1, kind2;
11170
19.3M
    const void *data1, *data2;
11171
19.3M
    Py_ssize_t len1, len2, len;
11172
11173
19.3M
    kind1 = PyUnicode_KIND(str1);
11174
19.3M
    kind2 = PyUnicode_KIND(str2);
11175
19.3M
    data1 = PyUnicode_DATA(str1);
11176
19.3M
    data2 = PyUnicode_DATA(str2);
11177
19.3M
    len1 = PyUnicode_GET_LENGTH(str1);
11178
19.3M
    len2 = PyUnicode_GET_LENGTH(str2);
11179
19.3M
    len = Py_MIN(len1, len2);
11180
11181
19.3M
    switch(kind1) {
11182
1.60M
    case PyUnicode_1BYTE_KIND:
11183
1.60M
    {
11184
1.60M
        switch(kind2) {
11185
74.7k
        case PyUnicode_1BYTE_KIND:
11186
74.7k
        {
11187
74.7k
            int cmp = memcmp(data1, data2, len);
11188
            /* normalize result of memcmp() into the range [-1; 1] */
11189
74.7k
            if (cmp < 0)
11190
47.0k
                return -1;
11191
27.6k
            if (cmp > 0)
11192
27.1k
                return 1;
11193
529
            break;
11194
27.6k
        }
11195
1.31M
        case PyUnicode_2BYTE_KIND:
11196
1.31M
            COMPARE(Py_UCS1, Py_UCS2);
11197
0
            break;
11198
214k
        case PyUnicode_4BYTE_KIND:
11199
214k
            COMPARE(Py_UCS1, Py_UCS4);
11200
0
            break;
11201
0
        default:
11202
0
            Py_UNREACHABLE();
11203
1.60M
        }
11204
529
        break;
11205
1.60M
    }
11206
16.2M
    case PyUnicode_2BYTE_KIND:
11207
16.2M
    {
11208
16.2M
        switch(kind2) {
11209
5.61k
        case PyUnicode_1BYTE_KIND:
11210
5.61k
            COMPARE(Py_UCS2, Py_UCS1);
11211
0
            break;
11212
14.5M
        case PyUnicode_2BYTE_KIND:
11213
14.5M
        {
11214
14.5M
            COMPARE(Py_UCS2, Py_UCS2);
11215
0
            break;
11216
14.5M
        }
11217
1.72M
        case PyUnicode_4BYTE_KIND:
11218
1.72M
            COMPARE(Py_UCS2, Py_UCS4);
11219
0
            break;
11220
0
        default:
11221
0
            Py_UNREACHABLE();
11222
16.2M
        }
11223
0
        break;
11224
16.2M
    }
11225
1.48M
    case PyUnicode_4BYTE_KIND:
11226
1.48M
    {
11227
1.48M
        switch(kind2) {
11228
729
        case PyUnicode_1BYTE_KIND:
11229
729
            COMPARE(Py_UCS4, Py_UCS1);
11230
0
            break;
11231
557k
        case PyUnicode_2BYTE_KIND:
11232
557k
            COMPARE(Py_UCS4, Py_UCS2);
11233
0
            break;
11234
928k
        case PyUnicode_4BYTE_KIND:
11235
928k
        {
11236
928k
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11237
928k
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11238
            /* normalize result of wmemcmp() into the range [-1; 1] */
11239
928k
            if (cmp < 0)
11240
458k
                return -1;
11241
470k
            if (cmp > 0)
11242
470k
                return 1;
11243
#else
11244
            COMPARE(Py_UCS4, Py_UCS4);
11245
#endif
11246
0
            break;
11247
470k
        }
11248
0
        default:
11249
0
            Py_UNREACHABLE();
11250
1.48M
        }
11251
0
        break;
11252
1.48M
    }
11253
0
    default:
11254
0
        Py_UNREACHABLE();
11255
19.3M
    }
11256
11257
529
    if (len1 == len2)
11258
526
        return 0;
11259
3
    if (len1 < len2)
11260
3
        return -1;
11261
0
    else
11262
0
        return 1;
11263
11264
3
#undef COMPARE
11265
3
}
11266
11267
11268
int
11269
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11270
265M
{
11271
265M
    assert(PyUnicode_Check(str1));
11272
265M
    assert(PyUnicode_Check(str2));
11273
265M
    if (str1 == str2) {
11274
74.2M
        return 1;
11275
74.2M
    }
11276
191M
    return unicode_eq(str1, str2);
11277
265M
}
11278
11279
11280
int
11281
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11282
0
{
11283
0
    if (!PyUnicode_Check(str1)) {
11284
0
        PyErr_Format(PyExc_TypeError,
11285
0
                     "first argument must be str, not %T", str1);
11286
0
        return -1;
11287
0
    }
11288
0
    if (!PyUnicode_Check(str2)) {
11289
0
        PyErr_Format(PyExc_TypeError,
11290
0
                     "second argument must be str, not %T", str2);
11291
0
        return -1;
11292
0
    }
11293
11294
0
    return _PyUnicode_Equal(str1, str2);
11295
0
}
11296
11297
11298
int
11299
PyUnicode_Compare(PyObject *left, PyObject *right)
11300
7.15k
{
11301
7.15k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11302
        /* a string is equal to itself */
11303
7.15k
        if (left == right)
11304
0
            return 0;
11305
11306
7.15k
        return unicode_compare(left, right);
11307
7.15k
    }
11308
0
    PyErr_Format(PyExc_TypeError,
11309
0
                 "Can't compare %.100s and %.100s",
11310
0
                 Py_TYPE(left)->tp_name,
11311
0
                 Py_TYPE(right)->tp_name);
11312
0
    return -1;
11313
7.15k
}
11314
11315
int
11316
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11317
1.86M
{
11318
1.86M
    Py_ssize_t i;
11319
1.86M
    int kind;
11320
1.86M
    Py_UCS4 chr;
11321
11322
1.86M
    assert(_PyUnicode_CHECK(uni));
11323
1.86M
    kind = PyUnicode_KIND(uni);
11324
1.86M
    if (kind == PyUnicode_1BYTE_KIND) {
11325
1.86M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11326
1.86M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11327
1.86M
        size_t len, len2 = strlen(str);
11328
1.86M
        int cmp;
11329
11330
1.86M
        len = Py_MIN(len1, len2);
11331
1.86M
        cmp = memcmp(data, str, len);
11332
1.86M
        if (cmp != 0) {
11333
1.34M
            if (cmp < 0)
11334
7.16k
                return -1;
11335
1.33M
            else
11336
1.33M
                return 1;
11337
1.34M
        }
11338
519k
        if (len1 > len2)
11339
209
            return 1; /* uni is longer */
11340
518k
        if (len1 < len2)
11341
820
            return -1; /* str is longer */
11342
517k
        return 0;
11343
518k
    }
11344
1.51k
    else {
11345
1.51k
        const void *data = PyUnicode_DATA(uni);
11346
        /* Compare Unicode string and source character set string */
11347
2.78k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11348
2.54k
            if (chr != (unsigned char)str[i])
11349
1.27k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11350
        /* This check keeps Python strings that end in '\0' from comparing equal
11351
         to C strings identical up to that point. */
11352
240
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11353
240
            return 1; /* uni is longer */
11354
0
        if (str[i])
11355
0
            return -1; /* str is longer */
11356
0
        return 0;
11357
0
    }
11358
1.86M
}
11359
11360
int
11361
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11362
0
{
11363
0
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11364
0
}
11365
11366
int
11367
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11368
0
{
11369
0
    assert(_PyUnicode_CHECK(unicode));
11370
0
    assert(str);
11371
11372
0
    if (PyUnicode_IS_ASCII(unicode)) {
11373
0
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11374
0
        return size == len &&
11375
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11376
0
    }
11377
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11378
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11379
0
        return size == len &&
11380
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11381
0
    }
11382
11383
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11384
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11385
0
        return 0;
11386
0
    }
11387
0
    const unsigned char *s = (const unsigned char *)str;
11388
0
    const unsigned char *ends = s + (size_t)size;
11389
0
    int kind = PyUnicode_KIND(unicode);
11390
0
    const void *data = PyUnicode_DATA(unicode);
11391
    /* Compare Unicode string and UTF-8 string */
11392
0
    for (Py_ssize_t i = 0; i < len; i++) {
11393
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11394
0
        if (ch < 0x80) {
11395
0
            if (ends == s || s[0] != ch) {
11396
0
                return 0;
11397
0
            }
11398
0
            s += 1;
11399
0
        }
11400
0
        else if (ch < 0x800) {
11401
0
            if ((ends - s) < 2 ||
11402
0
                s[0] != (0xc0 | (ch >> 6)) ||
11403
0
                s[1] != (0x80 | (ch & 0x3f)))
11404
0
            {
11405
0
                return 0;
11406
0
            }
11407
0
            s += 2;
11408
0
        }
11409
0
        else if (ch < 0x10000) {
11410
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11411
0
                (ends - s) < 3 ||
11412
0
                s[0] != (0xe0 | (ch >> 12)) ||
11413
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11414
0
                s[2] != (0x80 | (ch & 0x3f)))
11415
0
            {
11416
0
                return 0;
11417
0
            }
11418
0
            s += 3;
11419
0
        }
11420
0
        else {
11421
0
            assert(ch <= MAX_UNICODE);
11422
0
            if ((ends - s) < 4 ||
11423
0
                s[0] != (0xf0 | (ch >> 18)) ||
11424
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11425
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11426
0
                s[3] != (0x80 | (ch & 0x3f)))
11427
0
            {
11428
0
                return 0;
11429
0
            }
11430
0
            s += 4;
11431
0
        }
11432
0
    }
11433
0
    return s == ends;
11434
0
}
11435
11436
int
11437
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11438
7.28M
{
11439
7.28M
    size_t len;
11440
7.28M
    assert(_PyUnicode_CHECK(unicode));
11441
7.28M
    assert(str);
11442
#ifndef NDEBUG
11443
    for (const char *p = str; *p; p++) {
11444
        assert((unsigned char)*p < 128);
11445
    }
11446
#endif
11447
7.28M
    if (!PyUnicode_IS_ASCII(unicode))
11448
152k
        return 0;
11449
7.13M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11450
7.13M
    return strlen(str) == len &&
11451
7.13M
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11452
7.28M
}
11453
11454
int
11455
_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11456
0
{
11457
0
    PyObject *right_uni;
11458
11459
0
    assert(_PyUnicode_CHECK(left));
11460
0
    assert(right->string);
11461
#ifndef NDEBUG
11462
    for (const char *p = right->string; *p; p++) {
11463
        assert((unsigned char)*p < 128);
11464
    }
11465
#endif
11466
11467
0
    if (!PyUnicode_IS_ASCII(left))
11468
0
        return 0;
11469
11470
0
    right_uni = _PyUnicode_FromId(right);       /* borrowed */
11471
0
    if (right_uni == NULL) {
11472
        /* memory error or bad data */
11473
0
        PyErr_Clear();
11474
0
        return _PyUnicode_EqualToASCIIString(left, right->string);
11475
0
    }
11476
11477
0
    if (left == right_uni)
11478
0
        return 1;
11479
11480
0
    assert(PyUnicode_CHECK_INTERNED(right_uni));
11481
0
    if (PyUnicode_CHECK_INTERNED(left)) {
11482
0
        return 0;
11483
0
    }
11484
11485
0
    Py_hash_t right_hash = PyUnicode_HASH(right_uni);
11486
0
    assert(right_hash != -1);
11487
0
    Py_hash_t hash = PyUnicode_HASH(left);
11488
0
    if (hash != -1 && hash != right_hash) {
11489
0
        return 0;
11490
0
    }
11491
11492
0
    return unicode_eq(left, right_uni);
11493
0
}
11494
11495
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    /* Rich comparison for str.  Returns NotImplemented unless both
       operands are str; returns NULL with an exception set for an
       unknown 'op' when both operands are the same object. */
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* a string is equal to itself */
        switch (op) {
        case Py_EQ:
        case Py_LE:
        case Py_GE:
            Py_RETURN_TRUE;
        case Py_NE:
        case Py_LT:
        case Py_GT:
            Py_RETURN_FALSE;
        default:
            PyErr_BadArgument();
            return NULL;
        }
    }

    if (op == Py_EQ || op == Py_NE) {
        /* Equality tests use unicode_eq() rather than the ordering
           comparison; the answer is flipped for "!=". */
        int equal = unicode_eq(left, right);
        int wants_ne = (op == Py_NE);
        return PyBool_FromLong(equal ^ wants_ne);
    }

    /* Ordering operators: turn the three-way result into a bool. */
    int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11529
11530
int
11531
PyUnicode_Contains(PyObject *str, PyObject *substr)
11532
91.6M
{
11533
91.6M
    int kind1, kind2;
11534
91.6M
    const void *buf1, *buf2;
11535
91.6M
    Py_ssize_t len1, len2;
11536
91.6M
    int result;
11537
11538
91.6M
    if (!PyUnicode_Check(substr)) {
11539
0
        PyErr_Format(PyExc_TypeError,
11540
0
                     "'in <string>' requires string as left operand, not %.100s",
11541
0
                     Py_TYPE(substr)->tp_name);
11542
0
        return -1;
11543
0
    }
11544
91.6M
    if (ensure_unicode(str) < 0)
11545
0
        return -1;
11546
11547
91.6M
    kind1 = PyUnicode_KIND(str);
11548
91.6M
    kind2 = PyUnicode_KIND(substr);
11549
91.6M
    if (kind1 < kind2)
11550
3.77M
        return 0;
11551
87.8M
    len1 = PyUnicode_GET_LENGTH(str);
11552
87.8M
    len2 = PyUnicode_GET_LENGTH(substr);
11553
87.8M
    if (len1 < len2)
11554
6.04M
        return 0;
11555
81.8M
    buf1 = PyUnicode_DATA(str);
11556
81.8M
    buf2 = PyUnicode_DATA(substr);
11557
81.8M
    if (len2 == 1) {
11558
81.8M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11559
81.8M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11560
81.8M
        return result;
11561
81.8M
    }
11562
38.1k
    if (kind2 != kind1) {
11563
20.6k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11564
20.6k
        if (!buf2)
11565
0
            return -1;
11566
20.6k
    }
11567
11568
38.1k
    switch (kind1) {
11569
17.5k
    case PyUnicode_1BYTE_KIND:
11570
17.5k
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11571
17.5k
        break;
11572
15.8k
    case PyUnicode_2BYTE_KIND:
11573
15.8k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11574
15.8k
        break;
11575
4.81k
    case PyUnicode_4BYTE_KIND:
11576
4.81k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11577
4.81k
        break;
11578
0
    default:
11579
0
        Py_UNREACHABLE();
11580
38.1k
    }
11581
11582
38.1k
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11583
38.1k
    if (kind2 != kind1)
11584
20.6k
        PyMem_Free((void *)buf2);
11585
11586
38.1k
    return result;
11587
38.1k
}
11588
11589
/* Concat to string or Unicode object giving a new Unicode object. */
11590
11591
PyObject *
PyUnicode_Concat(PyObject *left, PyObject *right)
{
    /* Returns a new reference to left + right, or NULL with an exception
       set (failure reported by ensure_unicode(left), TypeError for a
       non-str 'right', OverflowError when the combined length would
       exceed PY_SSIZE_T_MAX, or allocation failure). */
    if (ensure_unicode(left) < 0) {
        return NULL;
    }

    if (!PyUnicode_Check(right)) {
        PyErr_Format(PyExc_TypeError,
            "can only concatenate str (not \"%.200s\") to str",
            Py_TYPE(right)->tp_name);
        return NULL;
    }

    /* Shortcuts: when one operand is the empty-string object, the result
       is obtained from the other operand via PyUnicode_FromObject(). */
    PyObject *empty = unicode_get_empty();  // Borrowed reference
    if (left == empty) {
        return PyUnicode_FromObject(right);
    }
    if (right == empty) {
        return PyUnicode_FromObject(left);
    }

    const Py_ssize_t left_len = PyUnicode_GET_LENGTH(left);
    const Py_ssize_t right_len = PyUnicode_GET_LENGTH(right);
    /* Checked before adding so the sum itself can never overflow. */
    if (left_len > PY_SSIZE_T_MAX - right_len) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        return NULL;
    }
    const Py_ssize_t new_len = left_len + right_len;

    /* The result is allocated for the larger of the two operands'
       maximum character values. */
    const Py_UCS4 left_max = PyUnicode_MAX_CHAR_VALUE(left);
    const Py_UCS4 right_max = PyUnicode_MAX_CHAR_VALUE(right);
    const Py_UCS4 maxchar = Py_MAX(left_max, right_max);

    /* Concat the two Unicode strings */
    PyObject *result = PyUnicode_New(new_len, maxchar);
    if (result == NULL) {
        return NULL;
    }
    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
11639
11640
void
11641
PyUnicode_Append(PyObject **p_left, PyObject *right)
11642
1.21M
{
11643
1.21M
    PyObject *left, *res;
11644
1.21M
    Py_UCS4 maxchar, maxchar2;
11645
1.21M
    Py_ssize_t left_len, right_len, new_len;
11646
11647
1.21M
    if (p_left == NULL) {
11648
0
        if (!PyErr_Occurred())
11649
0
            PyErr_BadInternalCall();
11650
0
        return;
11651
0
    }
11652
1.21M
    left = *p_left;
11653
1.21M
    if (right == NULL || left == NULL
11654
1.21M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11655
0
        if (!PyErr_Occurred())
11656
0
            PyErr_BadInternalCall();
11657
0
        goto error;
11658
0
    }
11659
11660
    /* Shortcuts */
11661
1.21M
    PyObject *empty = unicode_get_empty();  // Borrowed reference
11662
1.21M
    if (left == empty) {
11663
423k
        Py_DECREF(left);
11664
423k
        *p_left = Py_NewRef(right);
11665
423k
        return;
11666
423k
    }
11667
791k
    if (right == empty) {
11668
5
        return;
11669
5
    }
11670
11671
791k
    left_len = PyUnicode_GET_LENGTH(left);
11672
791k
    right_len = PyUnicode_GET_LENGTH(right);
11673
791k
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11674
0
        PyErr_SetString(PyExc_OverflowError,
11675
0
                        "strings are too large to concat");
11676
0
        goto error;
11677
0
    }
11678
791k
    new_len = left_len + right_len;
11679
11680
791k
    if (unicode_modifiable(left)
11681
791k
        && PyUnicode_CheckExact(right)
11682
791k
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11683
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11684
           to change the structure size, but characters are stored just after
11685
           the structure, and so it requires to move all characters which is
11686
           not so different than duplicating the string. */
11687
791k
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11688
744k
    {
11689
        /* append inplace */
11690
744k
        if (unicode_resize(p_left, new_len) != 0)
11691
0
            goto error;
11692
11693
        /* copy 'right' into the newly allocated area of 'left' */
11694
744k
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11695
744k
    }
11696
46.5k
    else {
11697
46.5k
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11698
46.5k
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11699
46.5k
        maxchar = Py_MAX(maxchar, maxchar2);
11700
11701
        /* Concat the two Unicode strings */
11702
46.5k
        res = PyUnicode_New(new_len, maxchar);
11703
46.5k
        if (res == NULL)
11704
0
            goto error;
11705
46.5k
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11706
46.5k
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11707
46.5k
        Py_DECREF(left);
11708
46.5k
        *p_left = res;
11709
46.5k
    }
11710
791k
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11711
791k
    return;
11712
11713
0
error:
11714
0
    Py_CLEAR(*p_left);
11715
0
}
11716
11717
void
11718
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11719
0
{
11720
0
    PyUnicode_Append(pleft, right);
11721
0
    Py_XDECREF(right);
11722
0
}
11723
11724
/*[clinic input]
11725
@permit_long_summary
11726
@text_signature "($self, sub[, start[, end]], /)"
11727
str.count as unicode_count -> Py_ssize_t
11728
11729
    self as str: self
11730
    sub as substr: unicode
11731
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11732
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11733
    /
11734
11735
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11736
11737
Optional arguments start and end are interpreted as in slice notation.
11738
[clinic start generated code]*/
11739
11740
static Py_ssize_t
11741
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11742
                   Py_ssize_t end)
11743
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11744
20.0M
{
11745
20.0M
    assert(PyUnicode_Check(str));
11746
20.0M
    assert(PyUnicode_Check(substr));
11747
11748
20.0M
    Py_ssize_t result;
11749
20.0M
    int kind1, kind2;
11750
20.0M
    const void *buf1 = NULL, *buf2 = NULL;
11751
20.0M
    Py_ssize_t len1, len2;
11752
11753
20.0M
    kind1 = PyUnicode_KIND(str);
11754
20.0M
    kind2 = PyUnicode_KIND(substr);
11755
20.0M
    if (kind1 < kind2)
11756
0
        return 0;
11757
11758
20.0M
    len1 = PyUnicode_GET_LENGTH(str);
11759
20.0M
    len2 = PyUnicode_GET_LENGTH(substr);
11760
20.0M
    ADJUST_INDICES(start, end, len1);
11761
20.0M
    if (end - start < len2)
11762
86.3k
        return 0;
11763
11764
20.0M
    buf1 = PyUnicode_DATA(str);
11765
20.0M
    buf2 = PyUnicode_DATA(substr);
11766
20.0M
    if (kind2 != kind1) {
11767
4.43M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11768
4.43M
        if (!buf2)
11769
0
            goto onError;
11770
4.43M
    }
11771
11772
    // We don't reuse `anylib_count` here because of the explicit casts.
11773
20.0M
    switch (kind1) {
11774
15.5M
    case PyUnicode_1BYTE_KIND:
11775
15.5M
        result = ucs1lib_count(
11776
15.5M
            ((const Py_UCS1*)buf1) + start, end - start,
11777
15.5M
            buf2, len2, PY_SSIZE_T_MAX
11778
15.5M
            );
11779
15.5M
        break;
11780
3.68M
    case PyUnicode_2BYTE_KIND:
11781
3.68M
        result = ucs2lib_count(
11782
3.68M
            ((const Py_UCS2*)buf1) + start, end - start,
11783
3.68M
            buf2, len2, PY_SSIZE_T_MAX
11784
3.68M
            );
11785
3.68M
        break;
11786
742k
    case PyUnicode_4BYTE_KIND:
11787
742k
        result = ucs4lib_count(
11788
742k
            ((const Py_UCS4*)buf1) + start, end - start,
11789
742k
            buf2, len2, PY_SSIZE_T_MAX
11790
742k
            );
11791
742k
        break;
11792
0
    default:
11793
0
        Py_UNREACHABLE();
11794
20.0M
    }
11795
11796
20.0M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11797
20.0M
    if (kind2 != kind1)
11798
4.43M
        PyMem_Free((void *)buf2);
11799
11800
20.0M
    return result;
11801
0
  onError:
11802
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11803
0
    if (kind2 != kind1)
11804
0
        PyMem_Free((void *)buf2);
11805
0
    return -1;
11806
20.0M
}
11807
11808
/*[clinic input]
11809
str.encode as unicode_encode
11810
11811
    encoding: str(c_default="NULL") = 'utf-8'
11812
        The encoding in which to encode the string.
11813
    errors: str(c_default="NULL") = 'strict'
11814
        The error handling scheme to use for encoding errors.
11815
        The default is 'strict' meaning that encoding errors raise a
11816
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11817
        'xmlcharrefreplace' as well as any other name registered with
11818
        codecs.register_error that can handle UnicodeEncodeErrors.
11819
11820
Encode the string using the codec registered for encoding.
11821
[clinic start generated code]*/
11822
11823
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* str.encode(): 'encoding' and 'errors' are forwarded unchanged
       (either may be NULL, the clinic-declared C default). */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11829
11830
/*[clinic input]
11831
str.expandtabs as unicode_expandtabs
11832
11833
    tabsize: int = 8
11834
11835
Return a copy where all tab characters are expanded using spaces.
11836
11837
If tabsize is not given, a tab size of 8 characters is assumed.
11838
[clinic start generated code]*/
11839
11840
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* str.expandtabs(): two passes over the input.  The first computes
       the output length (and detects whether there is any tab at all);
       the second fills a freshly allocated string.  With tabsize <= 0,
       tabs contribute nothing to the output.  Raises OverflowError if
       the expanded length would exceed PY_SSIZE_T_MAX. */
    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    PyObject *expanded;
    void *dest_data;
    Py_ssize_t pos, out_len, column, incr;
    Py_UCS4 ch;
    int saw_tab = 0;

    /* First pass: determine size of output string */
    out_len = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src_data, pos);
        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            out_len++;
            /* '\n' and '\r' restart the column count */
            column = (ch == '\n' || ch == '\r') ? 0 : column + 1;
            continue;
        }
        saw_tab = 1;
        if (tabsize > 0) {
            incr = tabsize - (column % tabsize); /* cannot overflow */
            if (out_len > PY_SSIZE_T_MAX - incr) {
                goto overflow;
            }
            column += incr;
            out_len += incr;
        }
    }
    if (!saw_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (!expanded) {
        return NULL;
    }
    dest_data = PyUnicode_DATA(expanded);

    out_len = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src_data, pos);
        if (ch != '\t') {
            PyUnicode_WRITE(kind, dest_data, out_len, ch);
            out_len++;
            column = (ch == '\n' || ch == '\r') ? 0 : column + 1;
            continue;
        }
        if (tabsize > 0) {
            /* pad with spaces up to the next tab stop */
            incr = tabsize - (column % tabsize);
            unicode_fill(kind, dest_data, ' ', out_len, incr);
            column += incr;
            out_len += incr;
        }
    }
    assert (out_len == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11915
11916
/*[clinic input]
11917
@permit_long_summary
11918
str.find as unicode_find = str.count
11919
11920
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11921
11922
Optional arguments start and end are interpreted as in slice notation.
11923
Return -1 on failure.
11924
[clinic start generated code]*/
11925
11926
static Py_ssize_t
11927
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11928
                  Py_ssize_t end)
11929
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11930
16.9M
{
11931
16.9M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11932
16.9M
    if (result < 0) {
11933
232k
        return -1;
11934
232k
    }
11935
16.6M
    return result;
11936
16.9M
}
11937
11938
static PyObject *
11939
unicode_getitem(PyObject *self, Py_ssize_t index)
11940
51.9M
{
11941
51.9M
    const void *data;
11942
51.9M
    int kind;
11943
51.9M
    Py_UCS4 ch;
11944
11945
51.9M
    if (!PyUnicode_Check(self)) {
11946
0
        PyErr_BadArgument();
11947
0
        return NULL;
11948
0
    }
11949
51.9M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11950
366
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11951
366
        return NULL;
11952
366
    }
11953
51.9M
    kind = PyUnicode_KIND(self);
11954
51.9M
    data = PyUnicode_DATA(self);
11955
51.9M
    ch = PyUnicode_READ(kind, data, index);
11956
51.9M
    return unicode_char(ch);
11957
51.9M
}
11958
11959
/* Believe it or not, this produces the same value for ASCII strings
11960
   as bytes_hash(). */
11961
static Py_hash_t
11962
unicode_hash(PyObject *self)
11963
45.9M
{
11964
45.9M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11965
11966
#ifdef Py_DEBUG
11967
    assert(_Py_HashSecret_Initialized);
11968
#endif
11969
45.9M
    Py_hash_t hash = PyUnicode_HASH(self);
11970
45.9M
    if (hash != -1) {
11971
247k
        return hash;
11972
247k
    }
11973
45.7M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11974
45.7M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11975
11976
45.7M
    PyUnicode_SET_HASH(self, x);
11977
45.7M
    return x;
11978
45.9M
}
11979
11980
/*[clinic input]
11981
@permit_long_summary
11982
str.index as unicode_index = str.count
11983
11984
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11985
11986
Optional arguments start and end are interpreted as in slice notation.
11987
Raises ValueError when the substring is not found.
11988
[clinic start generated code]*/
11989
11990
static Py_ssize_t
11991
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11992
                   Py_ssize_t end)
11993
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11994
556k
{
11995
556k
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11996
556k
    if (result == -1) {
11997
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
11998
0
    }
11999
556k
    else if (result < 0) {
12000
0
        return -1;
12001
0
    }
12002
556k
    return result;
12003
556k
}
12004
12005
/*[clinic input]
12006
str.isascii as unicode_isascii
12007
12008
Return True if all characters in the string are ASCII, False otherwise.
12009
12010
ASCII characters have code points in the range U+0000-U+007F.
12011
Empty string is ASCII too.
12012
[clinic start generated code]*/
12013
12014
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* Turn the result of PyUnicode_IS_ASCII() into a Python bool. */
    const long ascii_only = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(ascii_only);
}
12020
12021
/*[clinic input]
12022
@permit_long_docstring_body
12023
str.islower as unicode_islower
12024
12025
Return True if the string is a lowercase string, False otherwise.
12026
12027
A string is lowercase if all cased characters in the string are lowercase and
12028
there is at least one cased character in the string.
12029
[clinic start generated code]*/
12030
12031
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISLOWER(only));
    }

    /* Any uppercase or titlecase character makes the answer False;
       otherwise the answer is whether a lowercase character was seen. */
    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
12064
12065
/*[clinic input]
12066
@permit_long_docstring_body
12067
str.isupper as unicode_isupper
12068
12069
Return True if the string is an uppercase string, False otherwise.
12070
12071
A string is uppercase if all cased characters in the string are uppercase and
12072
there is at least one cased character in the string.
12073
[clinic start generated code]*/
12074
12075
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISUPPER(only) != 0);
    }

    /* Any lowercase or titlecase character makes the answer False;
       otherwise the answer is whether an uppercase character was seen. */
    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
12108
12109
/*[clinic input]
12110
str.istitle as unicode_istitle
12111
12112
Return True if the string is a title-cased string, False otherwise.
12113
12114
In a title-cased string, upper- and title-case characters may only
12115
follow uncased characters and lowercase characters only cased ones.
12116
[clinic start generated code]*/
12117
12118
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    int found_cased = 0;   /* at least one cased character was seen */
    int after_cased = 0;   /* the previous character was cased */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* Upper/title case is not allowed right after a cased char. */
            if (after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else if (Py_UNICODE_ISLOWER(ch)) {
            /* Lowercase is only allowed right after a cased char. */
            if (!after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else {
            /* Uncased character. */
            after_cased = 0;
            continue;
        }
        after_cased = 1;
        found_cased = 1;
    }
    return PyBool_FromLong(found_cased);
}
12164
12165
/*[clinic input]
12166
@permit_long_docstring_body
12167
str.isspace as unicode_isspace
12168
12169
Return True if the string is a whitespace string, False otherwise.
12170
12171
A string is whitespace if all characters in the string are whitespace and there
12172
is at least one character in the string.
12173
[clinic start generated code]*/
12174
12175
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISSPACE(only));
    }

    /* Stop at the first character that is not whitespace. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12203
12204
/*[clinic input]
12205
@permit_long_docstring_body
12206
str.isalpha as unicode_isalpha
12207
12208
Return True if the string is an alphabetic string, False otherwise.
12209
12210
A string is alphabetic if all characters in the string are alphabetic and there
12211
is at least one character in the string.
12212
[clinic start generated code]*/
12213
12214
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALPHA(only));
    }

    /* Stop at the first character that is not alphabetic. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALPHA(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12241
12242
/*[clinic input]
12243
@permit_long_docstring_body
12244
str.isalnum as unicode_isalnum
12245
12246
Return True if the string is an alpha-numeric string, False otherwise.
12247
12248
A string is alpha-numeric if all characters in the string are alpha-numeric and
12249
there is at least one character in the string.
12250
[clinic start generated code]*/
12251
12252
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
{
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    /* Stop at the first character that is not alpha-numeric. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12281
12282
/*[clinic input]
12283
@permit_long_docstring_body
12284
str.isdecimal as unicode_isdecimal
12285
12286
Return True if the string is a decimal string, False otherwise.
12287
12288
A string is a decimal string if all characters in the string are decimal and
12289
there is at least one character in the string.
12290
[clinic start generated code]*/
12291
12292
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDECIMAL(only));
    }

    /* Stop at the first character that is not decimal. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDECIMAL(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12319
12320
/*[clinic input]
12321
@permit_long_docstring_body
12322
str.isdigit as unicode_isdigit
12323
12324
Return True if the string is a digit string, False otherwise.
12325
12326
A string is a digit string if all characters in the string are digits and there
12327
is at least one character in the string.
12328
[clinic start generated code]*/
12329
12330
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    /* Stop at the first character that is not a digit. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDIGIT(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12358
12359
/*[clinic input]
12360
@permit_long_docstring_body
12361
str.isnumeric as unicode_isnumeric
12362
12363
Return True if the string is a numeric string, False otherwise.
12364
12365
A string is numeric if all characters in the string are numeric and there is at
12366
least one character in the string.
12367
[clinic start generated code]*/
12368
12369
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISNUMERIC(only));
    }

    /* Stop at the first character that is not numeric. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12396
12397
Py_ssize_t
12398
_PyUnicode_ScanIdentifier(PyObject *self)
12399
13.3k
{
12400
13.3k
    Py_ssize_t i;
12401
13.3k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12402
13.3k
    if (len == 0) {
12403
        /* an empty string is not a valid identifier */
12404
0
        return 0;
12405
0
    }
12406
12407
13.3k
    int kind = PyUnicode_KIND(self);
12408
13.3k
    const void *data = PyUnicode_DATA(self);
12409
13.3k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12410
    /* PEP 3131 says that the first character must be in
12411
       XID_Start and subsequent characters in XID_Continue,
12412
       and for the ASCII range, the 2.x rules apply (i.e
12413
       start with letters and underscore, continue with
12414
       letters, digits, underscore). However, given the current
12415
       definition of XID_Start and XID_Continue, it is sufficient
12416
       to check just for these, except that _ must be allowed
12417
       as starting an identifier.  */
12418
13.3k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12419
461
        return 0;
12420
461
    }
12421
12422
65.0k
    for (i = 1; i < len; i++) {
12423
52.3k
        ch = PyUnicode_READ(kind, data, i);
12424
52.3k
        if (!_PyUnicode_IsXidContinue(ch)) {
12425
221
            return i;
12426
221
        }
12427
52.3k
    }
12428
12.6k
    return i;
12429
12.9k
}
12430
12431
int
12432
PyUnicode_IsIdentifier(PyObject *self)
12433
956
{
12434
956
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12435
956
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12436
    /* an empty string is not a valid identifier */
12437
956
    return len && i == len;
12438
956
}
12439
12440
/*[clinic input]
12441
@permit_long_docstring_body
12442
str.isidentifier as unicode_isidentifier
12443
12444
Return True if the string is a valid Python identifier, False otherwise.
12445
12446
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12447
such as "def" or "class".
12448
[clinic start generated code]*/
12449
12450
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
{
    /* Wrap the int answer of PyUnicode_IsIdentifier() in a Python bool. */
    const int is_identifier = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_identifier);
}
12456
12457
/*[clinic input]
12458
@permit_long_summary
12459
str.isprintable as unicode_isprintable
12460
12461
Return True if all characters in the string are printable, False otherwise.
12462
12463
A character is printable if repr() may use it in its output.
12464
[clinic start generated code]*/
12465
12466
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISPRINTABLE(only));
    }

    /* No special case for the empty string: the loop body never runs
       and the result is True. */
    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISPRINTABLE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12490
12491
/*[clinic input]
12492
@permit_long_docstring_body
12493
str.join as unicode_join
12494
12495
    iterable: object
12496
    /
12497
12498
Concatenate any number of strings.
12499
12500
The string whose method is called is inserted in between each given string.
12501
The result is returned as a new string.
12502
12503
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12504
[clinic start generated code]*/
12505
12506
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
{
    /* Thin method wrapper: `self` is passed as the first argument of
       PyUnicode_Join() and `iterable` as the second. */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12512
12513
static Py_ssize_t
12514
unicode_length(PyObject *self)
12515
39.2M
{
12516
39.2M
    return PyUnicode_GET_LENGTH(self);
12517
39.2M
}
12518
12519
/*[clinic input]
12520
str.ljust as unicode_ljust
12521
12522
    width: Py_ssize_t
12523
    fillchar: Py_UCS4 = ' '
12524
    /
12525
12526
Return a left-justified string of length width.
12527
12528
Padding is done using the specified fill character (default is a space).
12529
[clinic start generated code]*/
12530
12531
static PyObject *
12532
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12533
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12534
0
{
12535
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12536
0
        return unicode_result_unchanged(self);
12537
12538
0
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12539
0
}
12540
12541
/*[clinic input]
12542
str.lower as unicode_lower
12543
12544
Return a copy of the string converted to lowercase.
12545
[clinic start generated code]*/
12546
12547
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* Non-ASCII strings go through case_operation() with do_lower;
       ASCII strings use ascii_upper_or_lower() with 1 as second argument. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }
    return ascii_upper_or_lower(self, 1);
}
12555
12556
58.9M
/* Strip modes.  The numeric values double as indexes into
   stripfuncnames[] below, so the two must be kept in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and
   the array of pointers are const: the table is read-only data. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that implements strip mode `i`. */
#define STRIPNAME(i) (stripfuncnames[i])
12564
12565
/* externally visible for str.strip(unicode) */
12566
/* Return the part of `self` left after removing, from the side(s) chosen
   by `striptype`, every leading/trailing character that occurs in
   `sepobj`. */
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    const int kind = PyUnicode_KIND(self);
    const void *const data = PyUnicode_DATA(self);
    const Py_ssize_t size = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);
    /* The bloom mask is a cheap pre-filter; PyUnicode_FindChar() is only
       consulted for characters that pass it. */
    const BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                               PyUnicode_DATA(sepobj),
                                               seplen);

    Py_ssize_t first = 0;     /* index of the first character kept */
    Py_ssize_t stop = size;   /* one past the last character kept */

    if (striptype != RIGHTSTRIP) {
        for (; first < size; first++) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, first);
            if (!BLOOM(sepmask, ch)
                || PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
            {
                break;
            }
        }
    }

    if (striptype != LEFTSTRIP) {
        for (; stop > first; stop--) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, stop - 1);
            if (!BLOOM(sepmask, ch)
                || PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
            {
                break;
            }
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12612
12613
PyObject*
12614
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12615
288M
{
12616
288M
    const unsigned char *data;
12617
288M
    int kind;
12618
288M
    Py_ssize_t length;
12619
12620
288M
    length = PyUnicode_GET_LENGTH(self);
12621
288M
    end = Py_MIN(end, length);
12622
12623
288M
    if (start == 0 && end == length)
12624
51.7M
        return unicode_result_unchanged(self);
12625
12626
236M
    if (start < 0 || end < 0) {
12627
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12628
0
        return NULL;
12629
0
    }
12630
236M
    if (start >= length || end < start)
12631
175k
        _Py_RETURN_UNICODE_EMPTY();
12632
12633
236M
    length = end - start;
12634
236M
    if (PyUnicode_IS_ASCII(self)) {
12635
49.6M
        data = PyUnicode_1BYTE_DATA(self);
12636
49.6M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12637
49.6M
    }
12638
186M
    else {
12639
186M
        kind = PyUnicode_KIND(self);
12640
186M
        data = PyUnicode_1BYTE_DATA(self);
12641
186M
        return PyUnicode_FromKindAndData(kind,
12642
186M
                                         data + kind * start,
12643
186M
                                         length);
12644
186M
    }
12645
236M
}
12646
12647
/* Strip whitespace from `self` on the side(s) selected by `striptype`
   and return the remaining substring. */
static PyObject *
do_strip(PyObject *self, int striptype)
{
    const Py_ssize_t size = PyUnicode_GET_LENGTH(self);
    const int trim_left = (striptype != RIGHTSTRIP);
    const int trim_right = (striptype != LEFTSTRIP);

    Py_ssize_t first = 0;     /* index of the first character kept */
    Py_ssize_t stop = size;   /* one past the last character kept */

    if (PyUnicode_IS_ASCII(self)) {
        /* ASCII strings: look each byte up in _Py_ascii_whitespace[]. */
        const Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(self);

        if (trim_left) {
            while (first < size && _Py_ascii_whitespace[bytes[first]]) {
                first++;
            }
        }
        if (trim_right) {
            while (stop > first && _Py_ascii_whitespace[bytes[stop - 1]]) {
                stop--;
            }
        }
    }
    else {
        /* Other strings: test each character with Py_UNICODE_ISSPACE(). */
        const int kind = PyUnicode_KIND(self);
        const void *const data = PyUnicode_DATA(self);

        if (trim_left) {
            for (; first < size; first++) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, first);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
        if (trim_right) {
            for (; stop > first; stop--) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, stop - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12708
12709
12710
/* Dispatch a strip call on its optional `sep` argument: None strips
   whitespace, a str strips the characters it contains, and anything
   else sets TypeError and returns NULL. */
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12726
12727
12728
/*[clinic input]
12729
@permit_long_summary
12730
str.strip as unicode_strip
12731
12732
    chars: object = None
12733
    /
12734
12735
Return a copy of the string with leading and trailing whitespace removed.
12736
12737
If chars is given and not None, remove characters in chars instead.
12738
[clinic start generated code]*/
12739
12740
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
{
    /* Strip both ends; `chars` is interpreted by do_argstrip(). */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12746
12747
12748
/*[clinic input]
12749
str.lstrip as unicode_lstrip
12750
12751
    chars: object = None
12752
    /
12753
12754
Return a copy of the string with leading whitespace removed.
12755
12756
If chars is given and not None, remove characters in chars instead.
12757
[clinic start generated code]*/
12758
12759
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* Strip the left end only; `chars` is interpreted by do_argstrip(). */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12765
12766
12767
/*[clinic input]
12768
str.rstrip as unicode_rstrip
12769
12770
    chars: object = None
12771
    /
12772
12773
Return a copy of the string with trailing whitespace removed.
12774
12775
If chars is given and not None, remove characters in chars instead.
12776
[clinic start generated code]*/
12777
12778
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* Strip the right end only; `chars` is interpreted by do_argstrip(). */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12784
12785
12786
static PyObject*
12787
unicode_repeat(PyObject *str, Py_ssize_t len)
12788
429k
{
12789
429k
    PyObject *u;
12790
429k
    Py_ssize_t nchars, n;
12791
12792
429k
    if (len < 1)
12793
37.8k
        _Py_RETURN_UNICODE_EMPTY();
12794
12795
    /* no repeat, return original string */
12796
392k
    if (len == 1)
12797
129k
        return unicode_result_unchanged(str);
12798
12799
262k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12800
0
        PyErr_SetString(PyExc_OverflowError,
12801
0
                        "repeated string is too long");
12802
0
        return NULL;
12803
0
    }
12804
262k
    nchars = len * PyUnicode_GET_LENGTH(str);
12805
12806
262k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12807
262k
    if (!u)
12808
0
        return NULL;
12809
262k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12810
12811
262k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12812
259k
        int kind = PyUnicode_KIND(str);
12813
259k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12814
259k
        if (kind == PyUnicode_1BYTE_KIND) {
12815
259k
            void *to = PyUnicode_DATA(u);
12816
259k
            memset(to, (unsigned char)fill_char, len);
12817
259k
        }
12818
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12819
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12820
0
            for (n = 0; n < len; ++n)
12821
0
                ucs2[n] = fill_char;
12822
0
        } else {
12823
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12824
0
            assert(kind == PyUnicode_4BYTE_KIND);
12825
0
            for (n = 0; n < len; ++n)
12826
0
                ucs4[n] = fill_char;
12827
0
        }
12828
259k
    }
12829
3.02k
    else {
12830
3.02k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12831
3.02k
        char *to = (char *) PyUnicode_DATA(u);
12832
3.02k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12833
3.02k
            PyUnicode_GET_LENGTH(str) * char_size);
12834
3.02k
    }
12835
12836
262k
    assert(_PyUnicode_CheckConsistency(u, 1));
12837
262k
    return u;
12838
262k
}
12839
12840
PyObject *
12841
PyUnicode_Replace(PyObject *str,
12842
                  PyObject *substr,
12843
                  PyObject *replstr,
12844
                  Py_ssize_t maxcount)
12845
2
{
12846
2
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12847
2
            ensure_unicode(replstr) < 0)
12848
0
        return NULL;
12849
2
    return replace(str, substr, replstr, maxcount);
12850
2
}
12851
12852
/*[clinic input]
12853
@permit_long_docstring_body
12854
str.replace as unicode_replace
12855
12856
    old: unicode
12857
    new: unicode
12858
    /
12859
    count: Py_ssize_t = -1
12860
        Maximum number of occurrences to replace.
12861
        -1 (the default value) means replace all occurrences.
12862
12863
Return a copy with all occurrences of substring old replaced by new.
12864
12865
If the optional argument count is given, only the first count occurrences are
12866
replaced.
12867
[clinic start generated code]*/
12868
12869
static PyObject *
12870
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12871
                     Py_ssize_t count)
12872
/*[clinic end generated code: output=b63f1a8b5eebf448 input=f27ca92ac46b65a1]*/
12873
78.7M
{
12874
78.7M
    return replace(self, old, new, count);
12875
78.7M
}
12876
12877
/*[clinic input]
12878
@permit_long_docstring_body
12879
str.removeprefix as unicode_removeprefix
12880
12881
    prefix: unicode
12882
    /
12883
12884
Return a str with the given prefix string removed if present.
12885
12886
If the string starts with the prefix string, return string[len(prefix):].
12887
Otherwise, return a copy of the original string.
12888
[clinic start generated code]*/
12889
12890
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
{
    /* Test the whole string (range 0..PY_SSIZE_T_MAX) with direction -1,
       the same direction str.startswith passes to tailmatch(). */
    const int found = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
    if (found == -1) {
        return NULL;
    }
    if (!found) {
        /* No match: hand back the original string. */
        return unicode_result_unchanged(self);
    }

    /* Match: keep everything from the end of the prefix onwards. */
    Py_ssize_t prefix_len = PyUnicode_GET_LENGTH(prefix);
    Py_ssize_t self_len = PyUnicode_GET_LENGTH(self);
    return PyUnicode_Substring(self, prefix_len, self_len);
}
12904
12905
/*[clinic input]
12906
str.removesuffix as unicode_removesuffix
12907
12908
    suffix: unicode
12909
    /
12910
12911
Return a str with the given suffix string removed if present.
12912
12913
If the string ends with the suffix string and that suffix is not empty,
12914
return string[:-len(suffix)]. Otherwise, return a copy of the original
12915
string.
12916
[clinic start generated code]*/
12917
12918
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* Test the whole string (range 0..PY_SSIZE_T_MAX) with direction +1,
       the same direction str.endswith passes to tailmatch(). */
    const int found = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
    if (found == -1) {
        return NULL;
    }
    if (!found) {
        /* No match: hand back the original string. */
        return unicode_result_unchanged(self);
    }

    /* Match: keep everything before the suffix. */
    Py_ssize_t keep = PyUnicode_GET_LENGTH(self)
                      - PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, keep);
}
12932
12933
/* Build the repr() of the str object `unicode`.

   The function makes one sizing pass over the input to compute the output
   length, count single and double quotes, and find the largest character
   that will be copied unescaped.  It then allocates the result with
   PyUnicode_New() and either copies the text verbatim between two quotes
   (nothing needs escaping) or delegates the writing to the kind-specific
   ucs1lib_repr()/ucs2lib_repr()/ucs4lib_repr() helper.

   Quote selection: single quotes are used unless the text contains single
   quotes and no double quotes, in which case double quotes are used.

   Returns the new string, or NULL with OverflowError set when the output
   length would not fit in Py_ssize_t, or NULL when PyUnicode_New() fails. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;
    int ikind = PyUnicode_KIND(unicode);
    for (Py_ssize_t i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                maxch = (ch > maxch) ? ch : maxch;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    Py_UCS4 quote = '\'';
    int changed = (osize != isize);
    /* Extra characters needed to escape single quotes (one each). */
    Py_ssize_t nescape = 0;
    if (squote) {
        changed = 1;
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            nescape = squote;
        else
            quote = '"';
    }

    /* Add the quote escapes and the two enclosing quotes, with the same
       overflow protection as the sizing loop.  0 <= osize <= PY_SSIZE_T_MAX
       holds here, so the subtractions below cannot themselves overflow. */
    if (nescape > PY_SSIZE_T_MAX - osize
        || PY_SSIZE_T_MAX - osize - nescape < 2)
    {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too long to generate repr");
        return NULL;
    }
    osize += nescape + 2;

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL)
        return NULL;
    int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (!changed) {
        /* Nothing to escape: quote + verbatim copy + quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }
    else {
        /* Dispatch on the output kind chosen by PyUnicode_New(). */
        switch (okind) {
        case PyUnicode_1BYTE_KIND:
            ucs1lib_repr(unicode, quote, odata);
            break;
        case PyUnicode_2BYTE_KIND:
            ucs2lib_repr(unicode, quote, odata);
            break;
        default:
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
13023
13024
/*[clinic input]
13025
@permit_long_summary
13026
str.rfind as unicode_rfind = str.count
13027
13028
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
13029
13030
Optional arguments start and end are interpreted as in slice notation.
13031
Return -1 on failure.
13032
[clinic start generated code]*/
13033
13034
static Py_ssize_t
13035
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
13036
                   Py_ssize_t end)
13037
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
13038
10.2k
{
13039
10.2k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
13040
10.2k
    if (result < 0) {
13041
6.92k
        return -1;
13042
6.92k
    }
13043
3.31k
    return result;
13044
10.2k
}
13045
13046
/*[clinic input]
13047
@permit_long_summary
13048
str.rindex as unicode_rindex = str.count
13049
13050
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
13051
13052
Optional arguments start and end are interpreted as in slice notation.
13053
Raises ValueError when the substring is not found.
13054
[clinic start generated code]*/
13055
13056
static Py_ssize_t
13057
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
13058
                    Py_ssize_t end)
13059
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
13060
143k
{
13061
143k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
13062
143k
    if (result == -1) {
13063
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
13064
0
    }
13065
143k
    else if (result < 0) {
13066
0
        return -1;
13067
0
    }
13068
143k
    return result;
13069
143k
}
13070
13071
/*[clinic input]
13072
str.rjust as unicode_rjust
13073
13074
    width: Py_ssize_t
13075
    fillchar: Py_UCS4 = ' '
13076
    /
13077
13078
Return a right-justified string of length width.
13079
13080
Padding is done using the specified fill character (default is a space).
13081
[clinic start generated code]*/
13082
13083
static PyObject *
13084
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13085
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13086
0
{
13087
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13088
0
        return unicode_result_unchanged(self);
13089
13090
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13091
0
}
13092
13093
PyObject *
13094
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13095
0
{
13096
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13097
0
        return NULL;
13098
13099
0
    return split(s, sep, maxsplit);
13100
0
}
13101
13102
/*[clinic input]
13103
@permit_long_summary
13104
str.split as unicode_split
13105
13106
    sep: object = None
13107
        The separator used to split the string.
13108
13109
        When set to None (the default value), will split on any whitespace
13110
        character (including \n \r \t \f and spaces) and will discard
13111
        empty strings from the result.
13112
    maxsplit: Py_ssize_t = -1
13113
        Maximum number of splits.
13114
        -1 (the default value) means no limit.
13115
13116
Return a list of the substrings in the string, using sep as the separator string.
13117
13118
Splitting starts at the front of the string and works to the end.
13119
13120
Note, str.split() is mainly useful for data that has been intentionally
13121
delimited.  With natural text that includes punctuation, consider using
13122
the regular expression module.
13123
13124
[clinic start generated code]*/
13125
13126
static PyObject *
13127
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13128
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
13129
22.3M
{
13130
22.3M
    if (sep == Py_None)
13131
142k
        return split(self, NULL, maxsplit);
13132
22.2M
    if (PyUnicode_Check(sep))
13133
22.2M
        return split(self, sep, maxsplit);
13134
13135
0
    PyErr_Format(PyExc_TypeError,
13136
0
                 "must be str or None, not %.100s",
13137
0
                 Py_TYPE(sep)->tp_name);
13138
0
    return NULL;
13139
22.2M
}
13140
13141
/* Partition `str_obj` around `sep_obj` and return the resulting tuple.

   Both arguments are checked with ensure_unicode().  The search itself is
   done by the asciilib/ucs1lib/ucs2lib/ucs4lib partition helper that
   matches the storage kind of `str_obj`; when the separator uses a
   different kind, a temporary copy in the string's kind is obtained from
   unicode_askind() and released with PyMem_Free() afterwards.

   Returns NULL when a check or the temporary conversion fails. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int kind1 = PyUnicode_KIND(str_obj);
    const int kind2 = PyUnicode_KIND(sep_obj);
    const Py_ssize_t len1 = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t len2 = PyUnicode_GET_LENGTH(sep_obj);

    /* Shortcut: a separator of a wider kind, or longer than the string,
       yields the "not found" shape (string, "", "") right away. */
    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *buf1 = PyUnicode_DATA(str_obj);
    const void *buf2 = PyUnicode_DATA(sep_obj);
    if (kind2 != kind1) {
        /* From here on buf2 is a temporary buffer owned by this function. */
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (buf2 == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    switch (kind1) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        else {
            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    case PyUnicode_4BYTE_KIND:
        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    default:
        Py_UNREACHABLE();
    }

    /* buf2 aliases the separator's own data exactly when no conversion
       took place. */
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
    if (kind2 != kind1) {
        PyMem_Free((void *)buf2);
    }

    return out;
}
13191
13192
13193
/* Partition `str_obj` around `sep_obj` using the rpartition helpers and
   return the resulting tuple.

   Both arguments are checked with ensure_unicode().  The search itself is
   done by the asciilib/ucs1lib/ucs2lib/ucs4lib rpartition helper that
   matches the storage kind of `str_obj`; when the separator uses a
   different kind, a temporary copy in the string's kind is obtained from
   unicode_askind() and released with PyMem_Free() afterwards.

   Returns NULL when a check or the temporary conversion fails. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int kind1 = PyUnicode_KIND(str_obj);
    const int kind2 = PyUnicode_KIND(sep_obj);
    const Py_ssize_t len1 = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t len2 = PyUnicode_GET_LENGTH(sep_obj);

    /* Shortcut: a separator of a wider kind, or longer than the string,
       yields the "not found" shape ("", "", string) right away. */
    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *buf1 = PyUnicode_DATA(str_obj);
    const void *buf2 = PyUnicode_DATA(sep_obj);
    if (kind2 != kind1) {
        /* From here on buf2 is a temporary buffer owned by this function. */
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (buf2 == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    switch (kind1) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        else {
            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    case PyUnicode_4BYTE_KIND:
        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    default:
        Py_UNREACHABLE();
    }

    /* buf2 aliases the separator's own data exactly when no conversion
       took place. */
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
    if (kind2 != kind1) {
        PyMem_Free((void *)buf2);
    }

    return out;
}
13243
13244
/*[clinic input]
13245
@permit_long_docstring_body
13246
str.partition as unicode_partition
13247
13248
    sep: object
13249
    /
13250
13251
Partition the string into three parts using the given separator.
13252
13253
This will search for the separator in the string.  If the separator is found,
13254
returns a 3-tuple containing the part before the separator, the separator
13255
itself, and the part after it.
13256
13257
If the separator is not found, returns a 3-tuple containing the original string
13258
and two empty strings.
13259
[clinic start generated code]*/
13260
13261
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* The method is a direct front end for the public C API function. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
13267
13268
/*[clinic input]
13269
@permit_long_docstring_body
13270
str.rpartition as unicode_rpartition = str.partition
13271
13272
Partition the string into three parts using the given separator.
13273
13274
This will search for the separator in the string, starting at the end. If
13275
the separator is found, returns a 3-tuple containing the part before the
13276
separator, the separator itself, and the part after it.
13277
13278
If the separator is not found, returns a 3-tuple containing two empty strings
13279
and the original string.
13280
[clinic start generated code]*/
13281
13282
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* The method is a direct front end for the public C API function. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13288
13289
PyObject *
13290
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13291
0
{
13292
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13293
0
        return NULL;
13294
13295
0
    return rsplit(s, sep, maxsplit);
13296
0
}
13297
13298
/*[clinic input]
13299
@permit_long_summary
13300
str.rsplit as unicode_rsplit = str.split
13301
13302
Return a list of the substrings in the string, using sep as the separator string.
13303
13304
Splitting starts at the end of the string and works to the front.
13305
[clinic start generated code]*/
13306
13307
static PyObject *
13308
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13309
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13310
50
{
13311
50
    if (sep == Py_None)
13312
0
        return rsplit(self, NULL, maxsplit);
13313
50
    if (PyUnicode_Check(sep))
13314
50
        return rsplit(self, sep, maxsplit);
13315
13316
0
    PyErr_Format(PyExc_TypeError,
13317
0
                 "must be str or None, not %.100s",
13318
0
                 Py_TYPE(sep)->tp_name);
13319
0
    return NULL;
13320
50
}
13321
13322
/*[clinic input]
13323
@permit_long_docstring_body
13324
str.splitlines as unicode_splitlines
13325
13326
    keepends: bool = False
13327
13328
Return a list of the lines in the string, breaking at line boundaries.
13329
13330
Line breaks are not included in the resulting list unless keepends is given and
13331
true.
13332
[clinic start generated code]*/
13333
13334
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* All the work happens in the public C API function; `keepends` is
       forwarded unchanged. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13340
13341
static
13342
PyObject *unicode_str(PyObject *self)
13343
2.85M
{
13344
2.85M
    return unicode_result_unchanged(self);
13345
2.85M
}
13346
13347
/*[clinic input]
13348
@permit_long_summary
13349
str.swapcase as unicode_swapcase
13350
13351
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13352
[clinic start generated code]*/
13353
13354
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* Run the generic case-conversion driver with the swapcase callback. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13360
13361
/*[clinic input]
13362
13363
@staticmethod
13364
str.maketrans as unicode_maketrans
13365
13366
  x: object
13367
13368
  y: unicode=NULL
13369
13370
  z: unicode=NULL
13371
13372
  /
13373
13374
Return a translation table usable for str.translate().
13375
13376
If there is only one argument, it must be a dictionary mapping Unicode
13377
ordinals (integers) or characters to Unicode ordinals, strings or None.
13378
Character keys will be then converted to ordinals.
13379
If there are two arguments, they must be strings of equal length, and
13380
in the resulting dictionary, each character in x will be mapped to the
13381
character at the same position in y. If there is a third argument, it
13382
must be a string, whose characters will be mapped to None in the result.
13383
[clinic start generated code]*/
13384
13385
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* The translation table is always a freshly created dict; it is
       released at `err` on any failure. */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* One-argument form: x must be an exact dict whose keys are either
           ints (copied as they are) or length-1 strings (replaced by their
           ordinal). */
        if (!PyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto err;
        }

        Py_ssize_t pos = 0;
        PyObject *key, *value;
        while (PyDict_Next(x, &pos, &key, &value)) {
            if (PyUnicode_Check(key)) {
                /* convert string keys to integer keys */
                if (PyUnicode_GET_LENGTH(key) != 1) {
                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
                                    "table must be of length 1");
                    goto err;
                }
                int kind = PyUnicode_KIND(key);
                const void *data = PyUnicode_DATA(key);
                PyObject *ordinal = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
                if (ordinal == NULL) {
                    goto err;
                }
                int rc = PyDict_SetItem(table, ordinal, value);
                Py_DECREF(ordinal);
                if (rc < 0) {
                    goto err;
                }
            }
            else if (PyLong_Check(key)) {
                /* just keep integer keys */
                if (PyDict_SetItem(table, key, value) < 0) {
                    goto err;
                }
            }
            else {
                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
                                "be strings or integers");
                goto err;
            }
        }
    }
    else {
        /* Two- or three-argument form: x must be a string too, of equal
           length */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto err;
        }
        const Py_ssize_t x_len = PyUnicode_GET_LENGTH(x);
        if (x_len != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto err;
        }

        /* create entries for translating chars in x to those in y */
        const int x_kind = PyUnicode_KIND(x);
        const int y_kind = PyUnicode_KIND(y);
        const void *x_data = PyUnicode_DATA(x);
        const void *y_data = PyUnicode_DATA(y);
        for (Py_ssize_t idx = 0; idx < x_len; idx++) {
            PyObject *from = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, idx));
            if (from == NULL) {
                goto err;
            }
            PyObject *to = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, idx));
            if (to == NULL) {
                Py_DECREF(from);
                goto err;
            }
            int rc = PyDict_SetItem(table, from, to);
            Py_DECREF(from);
            Py_DECREF(to);
            if (rc < 0) {
                goto err;
            }
        }

        /* create entries for deleting chars in z */
        if (z != NULL) {
            const int z_kind = PyUnicode_KIND(z);
            const void *z_data = PyUnicode_DATA(z);
            const Py_ssize_t z_len = PyUnicode_GET_LENGTH(z);
            for (Py_ssize_t idx = 0; idx < z_len; idx++) {
                PyObject *doomed = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, idx));
                if (doomed == NULL) {
                    goto err;
                }
                int rc = PyDict_SetItem(table, doomed, Py_None);
                Py_DECREF(doomed);
                if (rc < 0) {
                    goto err;
                }
            }
        }
    }
    return table;

  err:
    Py_DECREF(table);
    return NULL;
}
13490
13491
/*[clinic input]
13492
@permit_long_docstring_body
13493
str.translate as unicode_translate
13494
13495
    table: object
13496
        Translation table, which must be a mapping of Unicode ordinals to
13497
        Unicode ordinals, strings, or None.
13498
    /
13499
13500
Replace each character in the string using the given translation table.
13501
13502
The table must implement lookup/indexing via __getitem__, for instance a
13503
dictionary or list.  If this operation raises LookupError, the character is
13504
left untouched.  Characters mapped to None are deleted.
13505
[clinic start generated code]*/
13506
13507
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* str.translate() always uses the "ignore" error-handler name. */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13513
13514
/*[clinic input]
13515
str.upper as unicode_upper
13516
13517
Return a copy of the string converted to uppercase.
13518
[clinic start generated code]*/
13519
13520
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    if (!PyUnicode_IS_ASCII(self)) {
        /* General path: full case conversion through do_upper. */
        return case_operation(self, do_upper);
    }
    /* ASCII-only strings take the dedicated helper.
       NOTE(review): the 0 flag presumably selects upper-casing -- confirm
       against ascii_upper_or_lower(). */
    return ascii_upper_or_lower(self, 0);
}
13528
13529
/*[clinic input]
13530
@permit_long_summary
13531
str.zfill as unicode_zfill
13532
13533
    width: Py_ssize_t
13534
    /
13535
13536
Pad a numeric string with zeros on the left, to fill a field of the given width.
13537
13538
The string is never truncated.
13539
[clinic start generated code]*/
13540
13541
static PyObject *
13542
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13543
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13544
0
{
13545
0
    Py_ssize_t fill;
13546
0
    PyObject *u;
13547
0
    int kind;
13548
0
    const void *data;
13549
0
    Py_UCS4 chr;
13550
13551
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13552
0
        return unicode_result_unchanged(self);
13553
13554
0
    fill = width - PyUnicode_GET_LENGTH(self);
13555
13556
0
    u = pad(self, fill, 0, '0');
13557
13558
0
    if (u == NULL)
13559
0
        return NULL;
13560
13561
0
    kind = PyUnicode_KIND(u);
13562
0
    data = PyUnicode_DATA(u);
13563
0
    chr = PyUnicode_READ(kind, data, fill);
13564
13565
0
    if (chr == '+' || chr == '-') {
13566
        /* move sign to beginning of string */
13567
0
        PyUnicode_WRITE(kind, data, 0, chr);
13568
0
        PyUnicode_WRITE(kind, data, fill, '0');
13569
0
    }
13570
13571
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13572
0
    return u;
13573
0
}
13574
13575
/*[clinic input]
13576
@permit_long_summary
13577
@text_signature "($self, prefix[, start[, end]], /)"
13578
str.startswith as unicode_startswith
13579
13580
    prefix as subobj: object
13581
        A string or a tuple of strings to try.
13582
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13583
        Optional start position. Default: start of the string.
13584
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13585
        Optional stop position. Default: end of the string.
13586
    /
13587
13588
Return True if the string starts with the specified prefix, False otherwise.
13589
[clinic start generated code]*/
13590
13591
static PyObject *
13592
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13593
                        Py_ssize_t end)
13594
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13595
68.5M
{
13596
68.5M
    if (PyTuple_Check(subobj)) {
13597
8.58M
        Py_ssize_t i;
13598
31.1M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13599
22.6M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13600
22.6M
            if (!PyUnicode_Check(substring)) {
13601
0
                PyErr_Format(PyExc_TypeError,
13602
0
                             "tuple for startswith must only contain str, "
13603
0
                             "not %.100s",
13604
0
                             Py_TYPE(substring)->tp_name);
13605
0
                return NULL;
13606
0
            }
13607
22.6M
            int result = tailmatch(self, substring, start, end, -1);
13608
22.6M
            if (result < 0) {
13609
0
                return NULL;
13610
0
            }
13611
22.6M
            if (result) {
13612
36.8k
                Py_RETURN_TRUE;
13613
36.8k
            }
13614
22.6M
        }
13615
        /* nothing matched */
13616
8.58M
        Py_RETURN_FALSE;
13617
8.58M
    }
13618
59.9M
    if (!PyUnicode_Check(subobj)) {
13619
0
        PyErr_Format(PyExc_TypeError,
13620
0
                     "startswith first arg must be str or "
13621
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13622
0
        return NULL;
13623
0
    }
13624
59.9M
    int result = tailmatch(self, subobj, start, end, -1);
13625
59.9M
    if (result < 0) {
13626
0
        return NULL;
13627
0
    }
13628
59.9M
    return PyBool_FromLong(result);
13629
59.9M
}
13630
13631
13632
/*[clinic input]
13633
@permit_long_summary
13634
@text_signature "($self, suffix[, start[, end]], /)"
13635
str.endswith as unicode_endswith
13636
13637
    suffix as subobj: object
13638
        A string or a tuple of strings to try.
13639
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13640
        Optional start position. Default: start of the string.
13641
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13642
        Optional stop position. Default: end of the string.
13643
    /
13644
13645
Return True if the string ends with the specified suffix, False otherwise.
13646
[clinic start generated code]*/
13647
13648
static PyObject *
13649
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13650
                      Py_ssize_t end)
13651
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13652
15.4M
{
13653
15.4M
    if (PyTuple_Check(subobj)) {
13654
191k
        Py_ssize_t i;
13655
352k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13656
332k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13657
332k
            if (!PyUnicode_Check(substring)) {
13658
0
                PyErr_Format(PyExc_TypeError,
13659
0
                             "tuple for endswith must only contain str, "
13660
0
                             "not %.100s",
13661
0
                             Py_TYPE(substring)->tp_name);
13662
0
                return NULL;
13663
0
            }
13664
332k
            int result = tailmatch(self, substring, start, end, +1);
13665
332k
            if (result < 0) {
13666
0
                return NULL;
13667
0
            }
13668
332k
            if (result) {
13669
171k
                Py_RETURN_TRUE;
13670
171k
            }
13671
332k
        }
13672
191k
        Py_RETURN_FALSE;
13673
191k
    }
13674
15.2M
    if (!PyUnicode_Check(subobj)) {
13675
0
        PyErr_Format(PyExc_TypeError,
13676
0
                     "endswith first arg must be str or "
13677
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13678
0
        return NULL;
13679
0
    }
13680
15.2M
    int result = tailmatch(self, subobj, start, end, +1);
13681
15.2M
    if (result < 0) {
13682
0
        return NULL;
13683
0
    }
13684
15.2M
    return PyBool_FromLong(result);
13685
15.2M
}
13686
13687
13688
static inline void
13689
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13690
63.5M
{
13691
63.5M
    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13692
63.5M
    writer->data = PyUnicode_DATA(writer->buffer);
13693
13694
63.5M
    if (!writer->readonly) {
13695
63.5M
        writer->kind = PyUnicode_KIND(writer->buffer);
13696
63.5M
        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13697
63.5M
    }
13698
17.8k
    else {
13699
        /* use a value smaller than PyUnicode_1BYTE_KIND() so
13700
           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13701
17.8k
        writer->kind = 0;
13702
17.8k
        assert(writer->kind <= PyUnicode_1BYTE_KIND);
13703
13704
        /* Copy-on-write mode: set buffer size to 0 so
13705
         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13706
         * next write. */
13707
17.8k
        writer->size = 0;
13708
17.8k
    }
13709
63.5M
}
13710
13711
13712
void
13713
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13714
50.7M
{
13715
50.7M
    memset(writer, 0, sizeof(*writer));
13716
13717
    /* ASCII is the bare minimum */
13718
50.7M
    writer->min_char = 127;
13719
13720
    /* use a kind value smaller than PyUnicode_1BYTE_KIND so
13721
       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13722
50.7M
    assert(writer->kind == 0);
13723
50.7M
    assert(writer->kind < PyUnicode_1BYTE_KIND);
13724
50.7M
}
13725
13726
13727
PyUnicodeWriter*
13728
PyUnicodeWriter_Create(Py_ssize_t length)
13729
4.22M
{
13730
4.22M
    if (length < 0) {
13731
0
        PyErr_SetString(PyExc_ValueError,
13732
0
                        "length must be positive");
13733
0
        return NULL;
13734
0
    }
13735
13736
4.22M
    const size_t size = sizeof(_PyUnicodeWriter);
13737
4.22M
    PyUnicodeWriter *pub_writer;
13738
4.22M
    pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
13739
4.22M
    if (pub_writer == NULL) {
13740
2.31M
        pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
13741
2.31M
        if (pub_writer == NULL) {
13742
0
            return (PyUnicodeWriter *)PyErr_NoMemory();
13743
0
        }
13744
2.31M
    }
13745
4.22M
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
13746
13747
4.22M
    _PyUnicodeWriter_Init(writer);
13748
4.22M
    if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
13749
0
        PyUnicodeWriter_Discard(pub_writer);
13750
0
        return NULL;
13751
0
    }
13752
4.22M
    writer->overallocate = 1;
13753
13754
4.22M
    return pub_writer;
13755
4.22M
}
13756
13757
13758
/* Release a writer without producing a string.  A NULL writer is a no-op. */
void
PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
{
    if (writer != NULL) {
        /* Drop the buffer first, then hand the struct to the freelist
           (or to PyMem_Free). */
        _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
        _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    }
}
13766
13767
13768
// Initialize _PyUnicodeWriter with initial buffer
13769
static inline void
13770
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13771
496k
{
13772
496k
    memset(writer, 0, sizeof(*writer));
13773
496k
    writer->buffer = buffer;
13774
496k
    _PyUnicodeWriter_Update(writer);
13775
496k
    writer->min_length = writer->size;
13776
496k
}
13777
13778
13779
int
13780
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13781
                                 Py_ssize_t length, Py_UCS4 maxchar)
13782
63.0M
{
13783
63.0M
    Py_ssize_t newlen;
13784
63.0M
    PyObject *newbuffer;
13785
13786
63.0M
    assert(length >= 0);
13787
63.0M
    assert(maxchar <= MAX_UNICODE);
13788
13789
    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13790
63.0M
    assert((maxchar > writer->maxchar && length >= 0)
13791
63.0M
           || length > 0);
13792
13793
63.0M
    if (length > PY_SSIZE_T_MAX - writer->pos) {
13794
0
        PyErr_NoMemory();
13795
0
        return -1;
13796
0
    }
13797
63.0M
    newlen = writer->pos + length;
13798
13799
63.0M
    maxchar = Py_MAX(maxchar, writer->min_char);
13800
13801
63.0M
    if (writer->buffer == NULL) {
13802
46.4M
        assert(!writer->readonly);
13803
46.4M
        if (writer->overallocate
13804
46.4M
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13805
            /* overallocate to limit the number of realloc() */
13806
36.0M
            newlen += newlen / OVERALLOCATE_FACTOR;
13807
36.0M
        }
13808
46.4M
        if (newlen < writer->min_length)
13809
42.1M
            newlen = writer->min_length;
13810
13811
46.4M
        writer->buffer = PyUnicode_New(newlen, maxchar);
13812
46.4M
        if (writer->buffer == NULL)
13813
0
            return -1;
13814
46.4M
    }
13815
16.6M
    else if (newlen > writer->size) {
13816
13.9M
        if (writer->overallocate
13817
13.9M
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13818
            /* overallocate to limit the number of realloc() */
13819
13.6M
            newlen += newlen / OVERALLOCATE_FACTOR;
13820
13.6M
        }
13821
13.9M
        if (newlen < writer->min_length)
13822
1.13k
            newlen = writer->min_length;
13823
13824
13.9M
        if (maxchar > writer->maxchar || writer->readonly) {
13825
            /* resize + widen */
13826
3.43M
            maxchar = Py_MAX(maxchar, writer->maxchar);
13827
3.43M
            newbuffer = PyUnicode_New(newlen, maxchar);
13828
3.43M
            if (newbuffer == NULL)
13829
0
                return -1;
13830
3.43M
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13831
3.43M
                                          writer->buffer, 0, writer->pos);
13832
3.43M
            Py_DECREF(writer->buffer);
13833
3.43M
            writer->readonly = 0;
13834
3.43M
        }
13835
10.5M
        else {
13836
10.5M
            newbuffer = resize_compact(writer->buffer, newlen);
13837
10.5M
            if (newbuffer == NULL)
13838
0
                return -1;
13839
10.5M
        }
13840
13.9M
        writer->buffer = newbuffer;
13841
13.9M
    }
13842
2.65M
    else if (maxchar > writer->maxchar) {
13843
2.65M
        assert(!writer->readonly);
13844
2.65M
        newbuffer = PyUnicode_New(writer->size, maxchar);
13845
2.65M
        if (newbuffer == NULL)
13846
0
            return -1;
13847
2.65M
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13848
2.65M
                                      writer->buffer, 0, writer->pos);
13849
2.65M
        Py_SETREF(writer->buffer, newbuffer);
13850
2.65M
    }
13851
63.0M
    _PyUnicodeWriter_Update(writer);
13852
63.0M
    return 0;
13853
13854
63.0M
#undef OVERALLOCATE_FACTOR
13855
63.0M
}
13856
13857
int
13858
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13859
                                     int kind)
13860
147k
{
13861
147k
    Py_UCS4 maxchar;
13862
13863
    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13864
147k
    assert(writer->kind < kind);
13865
13866
147k
    switch (kind)
13867
147k
    {
13868
0
    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13869
147k
    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13870
0
    case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13871
0
    default:
13872
0
        Py_UNREACHABLE();
13873
147k
    }
13874
13875
147k
    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13876
147k
}
13877
13878
static inline int
13879
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13880
292M
{
13881
292M
    assert(ch <= MAX_UNICODE);
13882
292M
    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13883
0
        return -1;
13884
292M
    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13885
292M
    writer->pos++;
13886
292M
    return 0;
13887
292M
}
13888
13889
int
13890
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13891
100M
{
13892
100M
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13893
100M
}
13894
13895
int
13896
PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
13897
74.0M
{
13898
74.0M
    if (ch > MAX_UNICODE) {
13899
0
        PyErr_SetString(PyExc_ValueError,
13900
0
                        "character must be in range(0x110000)");
13901
0
        return -1;
13902
0
    }
13903
13904
74.0M
    return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
13905
74.0M
}
13906
13907
int
13908
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13909
61.2M
{
13910
61.2M
    assert(PyUnicode_Check(str));
13911
13912
61.2M
    Py_UCS4 maxchar;
13913
61.2M
    Py_ssize_t len;
13914
13915
61.2M
    len = PyUnicode_GET_LENGTH(str);
13916
61.2M
    if (len == 0)
13917
22.6M
        return 0;
13918
38.5M
    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13919
38.5M
    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13920
21.1M
        if (writer->buffer == NULL && !writer->overallocate) {
13921
9.31k
            assert(_PyUnicode_CheckConsistency(str, 1));
13922
9.31k
            writer->readonly = 1;
13923
9.31k
            writer->buffer = Py_NewRef(str);
13924
9.31k
            _PyUnicodeWriter_Update(writer);
13925
9.31k
            writer->pos += len;
13926
9.31k
            return 0;
13927
9.31k
        }
13928
21.1M
        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13929
0
            return -1;
13930
21.1M
    }
13931
38.5M
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13932
38.5M
                                  str, 0, len);
13933
38.5M
    writer->pos += len;
13934
38.5M
    return 0;
13935
38.5M
}
13936
13937
int
13938
PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
13939
4.18M
{
13940
4.18M
    PyTypeObject *type = Py_TYPE(obj);
13941
4.18M
    if (type == &PyUnicode_Type) {
13942
4.18M
        return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj);
13943
4.18M
    }
13944
13945
0
    if (type == &PyLong_Type) {
13946
0
        return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
13947
0
    }
13948
13949
0
    PyObject *str = PyObject_Str(obj);
13950
0
    if (str == NULL) {
13951
0
        return -1;
13952
0
    }
13953
13954
0
    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
13955
0
    Py_DECREF(str);
13956
0
    return res;
13957
0
}
13958
13959
13960
int
13961
PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
13962
7.56M
{
13963
7.56M
    if (Py_TYPE(obj) == &PyLong_Type) {
13964
914k
        return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
13965
914k
    }
13966
13967
6.64M
    PyObject *repr = PyObject_Repr(obj);
13968
6.64M
    if (repr == NULL) {
13969
0
        return -1;
13970
0
    }
13971
13972
6.64M
    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr);
13973
6.64M
    Py_DECREF(repr);
13974
6.64M
    return res;
13975
6.64M
}
13976
13977
13978
int
13979
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13980
                                Py_ssize_t start, Py_ssize_t end)
13981
63.9M
{
13982
63.9M
    assert(0 <= start);
13983
63.9M
    assert(end <= PyUnicode_GET_LENGTH(str));
13984
63.9M
    assert(start <= end);
13985
13986
63.9M
    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13987
142
        return _PyUnicodeWriter_WriteStr(writer, str);
13988
13989
63.9M
    Py_ssize_t len = end - start;
13990
63.9M
    if (len == 0) {
13991
0
        return 0;
13992
0
    }
13993
13994
63.9M
    Py_UCS4 maxchar;
13995
63.9M
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) {
13996
14.0M
        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13997
14.0M
    }
13998
49.9M
    else {
13999
49.9M
        maxchar = writer->maxchar;
14000
49.9M
    }
14001
63.9M
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) {
14002
0
        return -1;
14003
0
    }
14004
14005
63.9M
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14006
63.9M
                                  str, start, len);
14007
63.9M
    writer->pos += len;
14008
63.9M
    return 0;
14009
63.9M
}
14010
14011
14012
int
14013
PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
14014
                               Py_ssize_t start, Py_ssize_t end)
14015
578k
{
14016
578k
    if (!PyUnicode_Check(str)) {
14017
0
        PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
14018
0
        return -1;
14019
0
    }
14020
578k
    if (start < 0 || start > end) {
14021
0
        PyErr_Format(PyExc_ValueError, "invalid start argument");
14022
0
        return -1;
14023
0
    }
14024
578k
    if (end > PyUnicode_GET_LENGTH(str)) {
14025
0
        PyErr_Format(PyExc_ValueError, "invalid end argument");
14026
0
        return -1;
14027
0
    }
14028
14029
578k
    return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
14030
578k
                                           start, end);
14031
578k
}
14032
14033
14034
int
14035
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14036
                                  const char *ascii, Py_ssize_t len)
14037
54.4M
{
14038
54.4M
    if (len == -1)
14039
0
        len = strlen(ascii);
14040
14041
54.4M
    assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
14042
14043
54.4M
    if (writer->buffer == NULL && !writer->overallocate) {
14044
8.54k
        PyObject *str;
14045
14046
8.54k
        str = _PyUnicode_FromASCII(ascii, len);
14047
8.54k
        if (str == NULL)
14048
0
            return -1;
14049
14050
8.54k
        writer->readonly = 1;
14051
8.54k
        writer->buffer = str;
14052
8.54k
        _PyUnicodeWriter_Update(writer);
14053
8.54k
        writer->pos += len;
14054
8.54k
        return 0;
14055
8.54k
    }
14056
14057
54.4M
    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14058
0
        return -1;
14059
14060
54.4M
    switch (writer->kind)
14061
54.4M
    {
14062
54.4M
    case PyUnicode_1BYTE_KIND:
14063
54.4M
    {
14064
54.4M
        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14065
54.4M
        Py_UCS1 *data = writer->data;
14066
14067
54.4M
        memcpy(data + writer->pos, str, len);
14068
54.4M
        break;
14069
0
    }
14070
10.6k
    case PyUnicode_2BYTE_KIND:
14071
10.6k
    {
14072
10.6k
        _PyUnicode_CONVERT_BYTES(
14073
10.6k
            Py_UCS1, Py_UCS2,
14074
10.6k
            ascii, ascii + len,
14075
10.6k
            (Py_UCS2 *)writer->data + writer->pos);
14076
10.6k
        break;
14077
0
    }
14078
3.24k
    case PyUnicode_4BYTE_KIND:
14079
3.24k
    {
14080
3.24k
        _PyUnicode_CONVERT_BYTES(
14081
3.24k
            Py_UCS1, Py_UCS4,
14082
3.24k
            ascii, ascii + len,
14083
3.24k
            (Py_UCS4 *)writer->data + writer->pos);
14084
3.24k
        break;
14085
0
    }
14086
0
    default:
14087
0
        Py_UNREACHABLE();
14088
54.4M
    }
14089
14090
54.4M
    writer->pos += len;
14091
54.4M
    return 0;
14092
54.4M
}
14093
14094
14095
int
14096
PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
14097
                           const char *str,
14098
                           Py_ssize_t size)
14099
426k
{
14100
426k
    assert(writer != NULL);
14101
426k
    _Py_AssertHoldsTstate();
14102
14103
426k
    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
14104
426k
    return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size);
14105
426k
}
14106
14107
14108
int
14109
PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
14110
                          const char *str,
14111
                          Py_ssize_t size)
14112
0
{
14113
0
    if (size < 0) {
14114
0
        size = strlen(str);
14115
0
    }
14116
14117
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
14118
0
    Py_ssize_t old_pos = _writer->pos;
14119
0
    int res = unicode_decode_utf8_writer(_writer, str, size,
14120
0
                                         _Py_ERROR_STRICT, NULL, NULL);
14121
0
    if (res < 0) {
14122
0
        _writer->pos = old_pos;
14123
0
    }
14124
0
    return res;
14125
0
}
14126
14127
14128
int
14129
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
14130
                                   const char *string,
14131
                                   Py_ssize_t length,
14132
                                   const char *errors,
14133
                                   Py_ssize_t *consumed)
14134
0
{
14135
0
    if (length < 0) {
14136
0
        length = strlen(string);
14137
0
    }
14138
14139
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
14140
0
    Py_ssize_t old_pos = _writer->pos;
14141
0
    int res = unicode_decode_utf8_writer(_writer, string, length,
14142
0
                                         _Py_ERROR_UNKNOWN, errors, consumed);
14143
0
    if (res < 0) {
14144
0
        _writer->pos = old_pos;
14145
0
        if (consumed) {
14146
0
            *consumed = 0;
14147
0
        }
14148
0
    }
14149
0
    return res;
14150
0
}
14151
14152
14153
int
14154
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14155
                                   const char *str, Py_ssize_t len)
14156
0
{
14157
0
    Py_UCS4 maxchar;
14158
14159
0
    maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14160
0
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14161
0
        return -1;
14162
0
    unicode_write_cstr(writer->buffer, writer->pos, str, len);
14163
0
    writer->pos += len;
14164
0
    return 0;
14165
0
}
14166
14167
PyObject *
14168
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14169
46.8M
{
14170
46.8M
    PyObject *str;
14171
14172
46.8M
    if (writer->pos == 0) {
14173
877
        Py_CLEAR(writer->buffer);
14174
877
        _Py_RETURN_UNICODE_EMPTY();
14175
877
    }
14176
14177
46.8M
    str = writer->buffer;
14178
46.8M
    writer->buffer = NULL;
14179
14180
46.8M
    if (writer->readonly) {
14181
16.7k
        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14182
16.7k
        return str;
14183
16.7k
    }
14184
14185
46.8M
    if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14186
45.9M
        PyObject *str2;
14187
45.9M
        str2 = resize_compact(str, writer->pos);
14188
45.9M
        if (str2 == NULL) {
14189
0
            Py_DECREF(str);
14190
0
            return NULL;
14191
0
        }
14192
45.9M
        str = str2;
14193
45.9M
    }
14194
14195
46.8M
    assert(_PyUnicode_CheckConsistency(str, 1));
14196
46.8M
    return unicode_result(str);
14197
46.8M
}
14198
14199
14200
PyObject*
14201
PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
14202
4.22M
{
14203
4.22M
    PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer);
14204
4.22M
    assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
14205
4.22M
    _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
14206
4.22M
    return str;
14207
4.22M
}
14208
14209
14210
void
14211
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14212
4.37M
{
14213
4.37M
    Py_CLEAR(writer->buffer);
14214
4.37M
}
14215
14216
#include "stringlib/unicode_format.h"
14217
14218
PyDoc_STRVAR(format__doc__,
14219
             "format($self, /, *args, **kwargs)\n\
14220
--\n\
14221
\n\
14222
Return a formatted version of the string, using substitutions from args and kwargs.\n\
14223
The substitutions are identified by braces ('{' and '}').");
14224
14225
PyDoc_STRVAR(format_map__doc__,
14226
             "format_map($self, mapping, /)\n\
14227
--\n\
14228
\n\
14229
Return a formatted version of the string, using substitutions from mapping.\n\
14230
The substitutions are identified by braces ('{' and '}').");
14231
14232
/*[clinic input]
14233
str.__format__ as unicode___format__
14234
14235
    format_spec: unicode
14236
    /
14237
14238
Return a formatted version of the string as described by format_spec.
14239
[clinic start generated code]*/
14240
14241
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    /* Format `self` using the whole of format_spec. */
    const Py_ssize_t spec_len = PyUnicode_GET_LENGTH(format_spec);
    int status = _PyUnicode_FormatAdvancedWriter(&writer,
                                                 self, format_spec, 0,
                                                 spec_len);
    if (status != -1) {
        return _PyUnicodeWriter_Finish(&writer);
    }

    /* Formatting failed: release whatever the writer holds. */
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
14258
14259
/*[clinic input]
14260
str.__sizeof__ as unicode_sizeof
14261
14262
Return the size of the string in memory, in bytes.
14263
[clinic start generated code]*/
14264
14265
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    Py_ssize_t total;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* Compact ASCII: base structure plus one byte per character and
           one extra byte. */
        total = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* Compact non-ASCII: base structure plus (length + 1) units of
           the string's kind. */
        total = sizeof(PyCompactUnicodeObject) +
            (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
    }
    else {
        /* Two-block object: base object, plus the character block when
           one is present. */
        total = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            total += (PyUnicode_GET_LENGTH(self) + 1) *
                PyUnicode_KIND(self);
        }
    }

    /* Add the separately stored UTF-8 representation, if any. */
    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        total += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(total);
}
14293
14294
/* __getnewargs__: return a 1-tuple holding a copy of `v` made by
   _PyUnicode_Copy().  Returns NULL if the copy fails. */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *duplicate = _PyUnicode_Copy(v);
    if (duplicate == NULL) {
        return NULL;
    }
    /* "N" hands the reference to `duplicate` over to the tuple. */
    return Py_BuildValue("(N)", duplicate);
}
14302
14303
/*
14304
This function searchs the longest common leading whitespace
14305
of all lines in the [src, end).
14306
It returns the length of the common leading whitespace and sets `output` to
14307
point to the beginning of the common leading whitespace if length > 0.
14308
*/
14309
static Py_ssize_t
14310
search_longest_common_leading_whitespace(
14311
    const char *const src,
14312
    const char *const end,
14313
    const char **output)
14314
0
{
14315
    // [_start, _start + _len)
14316
    // describes the current longest common leading whitespace
14317
0
    const char *_start = NULL;
14318
0
    Py_ssize_t _len = 0;
14319
14320
0
    for (const char *iter = src; iter < end; ++iter) {
14321
0
        const char *line_start = iter;
14322
0
        const char *leading_whitespace_end = NULL;
14323
14324
        // scan the whole line
14325
0
        while (iter < end && *iter != '\n') {
14326
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
14327
                /* `iter` points to the first non-whitespace character
14328
                   in this line */
14329
0
                if (iter == line_start) {
14330
                    // some line has no indent, fast exit!
14331
0
                    return 0;
14332
0
                }
14333
0
                leading_whitespace_end = iter;
14334
0
            }
14335
0
            ++iter;
14336
0
        }
14337
14338
        // if this line has all white space, skip it
14339
0
        if (!leading_whitespace_end) {
14340
0
            continue;
14341
0
        }
14342
14343
0
        if (!_start) {
14344
            // update the first leading whitespace
14345
0
            _start = line_start;
14346
0
            _len = leading_whitespace_end - line_start;
14347
0
            assert(_len > 0);
14348
0
        }
14349
0
        else {
14350
            /* We then compare with the current longest leading whitespace.
14351
14352
               [line_start, leading_whitespace_end) is the leading
14353
               whitespace of this line,
14354
14355
               [_start, _start + _len) is the leading whitespace of the
14356
               current longest leading whitespace. */
14357
0
            Py_ssize_t new_len = 0;
14358
0
            const char *_iter = _start, *line_iter = line_start;
14359
14360
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
14361
0
                   && *_iter == *line_iter)
14362
0
            {
14363
0
                ++_iter;
14364
0
                ++line_iter;
14365
0
                ++new_len;
14366
0
            }
14367
14368
0
            _len = new_len;
14369
0
            if (_len == 0) {
14370
                // No common things now, fast exit!
14371
0
                return 0;
14372
0
            }
14373
0
        }
14374
0
    }
14375
14376
0
    assert(_len >= 0);
14377
0
    if (_len > 0) {
14378
0
        *output = _start;
14379
0
    }
14380
0
    return _len;
14381
0
}
14382
14383
/* Dedent a string.
14384
   Behaviour is expected to be an exact match of `textwrap.dedent`.
14385
   Return a new reference on success, NULL with exception set on error.
14386
   */
14387
PyObject *
14388
_PyUnicode_Dedent(PyObject *unicode)
14389
0
{
14390
0
    Py_ssize_t src_len = 0;
14391
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
14392
0
    if (!src) {
14393
0
        return NULL;
14394
0
    }
14395
0
    assert(src_len >= 0);
14396
0
    if (src_len == 0) {
14397
0
        return Py_NewRef(unicode);
14398
0
    }
14399
14400
0
    const char *const end = src + src_len;
14401
14402
    // [whitespace_start, whitespace_start + whitespace_len)
14403
    // describes the current longest common leading whitespace
14404
0
    const char *whitespace_start = NULL;
14405
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
14406
0
        src, end, &whitespace_start);
14407
14408
0
    if (whitespace_len == 0) {
14409
0
        return Py_NewRef(unicode);
14410
0
    }
14411
14412
    // now we should trigger a dedent
14413
0
    char *dest = PyMem_Malloc(src_len);
14414
0
    if (!dest) {
14415
0
        PyErr_NoMemory();
14416
0
        return NULL;
14417
0
    }
14418
0
    char *dest_iter = dest;
14419
14420
0
    for (const char *iter = src; iter < end; ++iter) {
14421
0
        const char *line_start = iter;
14422
0
        bool in_leading_space = true;
14423
14424
        // iterate over a line to find the end of a line
14425
0
        while (iter < end && *iter != '\n') {
14426
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
14427
0
                in_leading_space = false;
14428
0
            }
14429
0
            ++iter;
14430
0
        }
14431
14432
        // invariant: *iter == '\n' or iter == end
14433
0
        bool append_newline = iter < end;
14434
14435
        // if this line has all white space, write '\n' and continue
14436
0
        if (in_leading_space && append_newline) {
14437
0
            *dest_iter++ = '\n';
14438
0
            continue;
14439
0
        }
14440
14441
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
14442
            conditionally append '\n' */
14443
14444
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
14445
0
        assert(new_line_len >= 0);
14446
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
14447
14448
0
        dest_iter += new_line_len;
14449
14450
0
        if (append_newline) {
14451
0
            *dest_iter++ = '\n';
14452
0
        }
14453
0
    }
14454
14455
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
14456
0
    PyMem_Free(dest);
14457
0
    return res;
14458
0
}
14459
14460
static PyMethodDef unicode_methods[] = {
14461
    UNICODE_ENCODE_METHODDEF
14462
    UNICODE_REPLACE_METHODDEF
14463
    UNICODE_SPLIT_METHODDEF
14464
    UNICODE_RSPLIT_METHODDEF
14465
    UNICODE_JOIN_METHODDEF
14466
    UNICODE_CAPITALIZE_METHODDEF
14467
    UNICODE_CASEFOLD_METHODDEF
14468
    UNICODE_TITLE_METHODDEF
14469
    UNICODE_CENTER_METHODDEF
14470
    UNICODE_COUNT_METHODDEF
14471
    UNICODE_EXPANDTABS_METHODDEF
14472
    UNICODE_FIND_METHODDEF
14473
    UNICODE_PARTITION_METHODDEF
14474
    UNICODE_INDEX_METHODDEF
14475
    UNICODE_LJUST_METHODDEF
14476
    UNICODE_LOWER_METHODDEF
14477
    UNICODE_LSTRIP_METHODDEF
14478
    UNICODE_RFIND_METHODDEF
14479
    UNICODE_RINDEX_METHODDEF
14480
    UNICODE_RJUST_METHODDEF
14481
    UNICODE_RSTRIP_METHODDEF
14482
    UNICODE_RPARTITION_METHODDEF
14483
    UNICODE_SPLITLINES_METHODDEF
14484
    UNICODE_STRIP_METHODDEF
14485
    UNICODE_SWAPCASE_METHODDEF
14486
    UNICODE_TRANSLATE_METHODDEF
14487
    UNICODE_UPPER_METHODDEF
14488
    UNICODE_STARTSWITH_METHODDEF
14489
    UNICODE_ENDSWITH_METHODDEF
14490
    UNICODE_REMOVEPREFIX_METHODDEF
14491
    UNICODE_REMOVESUFFIX_METHODDEF
14492
    UNICODE_ISASCII_METHODDEF
14493
    UNICODE_ISLOWER_METHODDEF
14494
    UNICODE_ISUPPER_METHODDEF
14495
    UNICODE_ISTITLE_METHODDEF
14496
    UNICODE_ISSPACE_METHODDEF
14497
    UNICODE_ISDECIMAL_METHODDEF
14498
    UNICODE_ISDIGIT_METHODDEF
14499
    UNICODE_ISNUMERIC_METHODDEF
14500
    UNICODE_ISALPHA_METHODDEF
14501
    UNICODE_ISALNUM_METHODDEF
14502
    UNICODE_ISIDENTIFIER_METHODDEF
14503
    UNICODE_ISPRINTABLE_METHODDEF
14504
    UNICODE_ZFILL_METHODDEF
14505
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
14506
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
14507
    UNICODE___FORMAT___METHODDEF
14508
    UNICODE_MAKETRANS_METHODDEF
14509
    UNICODE_SIZEOF_METHODDEF
14510
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
14511
    {NULL, NULL}
14512
};
14513
14514
/* nb_remainder slot: `v % w` string formatting.
   Returns NotImplemented when the left operand is not a str; otherwise
   returns the result of PyUnicode_Format(v, w). */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
14521
14522
static PyNumberMethods unicode_as_number = {
14523
    0,              /*nb_add*/
14524
    0,              /*nb_subtract*/
14525
    0,              /*nb_multiply*/
14526
    unicode_mod,            /*nb_remainder*/
14527
};
14528
14529
static PySequenceMethods unicode_as_sequence = {
14530
    unicode_length,     /* sq_length */
14531
    PyUnicode_Concat,   /* sq_concat */
14532
    unicode_repeat,     /* sq_repeat */
14533
    unicode_getitem,    /* sq_item */
14534
    0,                  /* sq_slice */
14535
    0,                  /* sq_ass_item */
14536
    0,                  /* sq_ass_slice */
14537
    PyUnicode_Contains, /* sq_contains */
14538
};
14539
14540
/* mp_subscript slot of str: implements self[item].

   - item supports the index protocol: return the single character at that
     position (negative values are offset by the string length first).
   - item is a slice: return the selected substring.
   - anything else: raise TypeError.

   Returns a str object, or NULL with an exception set. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    Py_ssize_t slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
                                                   &start, &stop, step);

    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        /* Contiguous slice: either the whole string or a plain substring. */
        if (start == 0 && slicelength == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + slicelength);
    }

    /* General case: extended slice with step != 1. */
    int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* First pass: find the largest selected code point so that the result
       can be allocated with a suitable maximum character. */
    Py_UCS4 max_char;
    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        /* The position is kept unsigned, as in the copy loop below; adding
           a negative step relies on modular arithmetic. */
        size_t scan = start;
        max_char = 0;
        for (Py_ssize_t n = 0; n < slicelength; n++, scan += step) {
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, scan);
            if (ch > max_char) {
                max_char = ch;
                /* Stop scanning once the limit for this kind is reached. */
                if (max_char >= kind_limit) {
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }
    int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    /* Second pass: copy the selected characters. */
    size_t src_pos = start;
    for (Py_ssize_t n = 0; n < slicelength; n++, src_pos += step) {
        Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, src_pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
14609
14610
static PyMappingMethods unicode_as_mapping = {
14611
    unicode_length,     /* mp_length */
14612
    unicode_subscript,  /* mp_subscript */
14613
    0,                  /* mp_ass_subscript */
14614
};
14615
14616
14617
/* Helpers for PyUnicode_Format() */
14618
14619
struct unicode_formatter_t {
14620
    PyObject *args;
14621
    int args_owned;
14622
    Py_ssize_t arglen, argidx;
14623
    PyObject *dict;
14624
14625
    int fmtkind;
14626
    Py_ssize_t fmtcnt, fmtpos;
14627
    const void *fmtdata;
14628
    PyObject *fmtstr;
14629
14630
    _PyUnicodeWriter writer;
14631
};
14632
14633
struct unicode_format_arg_t {
14634
    Py_UCS4 ch;
14635
    int flags;
14636
    Py_ssize_t width;
14637
    int prec;
14638
    int sign;
14639
};
14640
14641
static PyObject *
14642
unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14643
45.4M
{
14644
45.4M
    Py_ssize_t argidx = ctx->argidx;
14645
14646
45.4M
    if (argidx < ctx->arglen) {
14647
45.4M
        ctx->argidx++;
14648
45.4M
        if (ctx->arglen < 0)
14649
18.1M
            return ctx->args;
14650
27.3M
        else
14651
27.3M
            return PyTuple_GetItem(ctx->args, argidx);
14652
45.4M
    }
14653
0
    PyErr_SetString(PyExc_TypeError,
14654
0
                    "not enough arguments for format string");
14655
0
    return NULL;
14656
45.4M
}
14657
14658
/* Returns a new reference to a PyUnicode object, or NULL on failure. */
14659
14660
/* Format a float into the writer if the writer is not NULL, or into *p_output
14661
   otherwise.
14662
14663
   Return 0 on success, raise an exception and return -1 on error. */
14664
static int
14665
formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14666
            PyObject **p_output,
14667
            _PyUnicodeWriter *writer)
14668
0
{
14669
0
    char *p;
14670
0
    double x;
14671
0
    Py_ssize_t len;
14672
0
    int prec;
14673
0
    int dtoa_flags = 0;
14674
14675
0
    x = PyFloat_AsDouble(v);
14676
0
    if (x == -1.0 && PyErr_Occurred())
14677
0
        return -1;
14678
14679
0
    prec = arg->prec;
14680
0
    if (prec < 0)
14681
0
        prec = 6;
14682
14683
0
    if (arg->flags & F_ALT)
14684
0
        dtoa_flags |= Py_DTSF_ALT;
14685
0
    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14686
0
    if (p == NULL)
14687
0
        return -1;
14688
0
    len = strlen(p);
14689
0
    if (writer) {
14690
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14691
0
            PyMem_Free(p);
14692
0
            return -1;
14693
0
        }
14694
0
    }
14695
0
    else
14696
0
        *p_output = _PyUnicode_FromASCII(p, len);
14697
0
    PyMem_Free(p);
14698
0
    return 0;
14699
0
}
14700
14701
/* formatlong() emulates the format codes d, u, o, x and X, and
14702
 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
14703
 * Python's regular ints.
14704
 * Return value:  a new PyUnicodeObject*, or NULL if error.
14705
 *     The output string is of the form
14706
 *         "-"? ("0x" | "0X")? digit+
14707
 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
14708
 *         set in flags.  The case of hex digits will be correct,
14709
 *     There will be at least prec digits, zero-filled on the left if
14710
 *         necessary to get that many.
14711
 * val          object to be converted
14712
 * flags        bitmask of format flags; only F_ALT is looked at
14713
 * prec         minimum number of digits; 0-fill on left if needed
14714
 * type         a character in [duoxX]; u acts the same as d
14715
 *
14716
 * CAUTION:  o, x and X conversions on regular ints can never
14717
 * produce a '-' sign, but can for Python's unbounded ints.
14718
 */
14719
PyObject *
14720
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14721
1.53k
{
14722
1.53k
    PyObject *result = NULL;
14723
1.53k
    char *buf;
14724
1.53k
    Py_ssize_t i;
14725
1.53k
    int sign;           /* 1 if '-', else 0 */
14726
1.53k
    int len;            /* number of characters */
14727
1.53k
    Py_ssize_t llen;
14728
1.53k
    int numdigits;      /* len == numnondigits + numdigits */
14729
1.53k
    int numnondigits = 0;
14730
14731
    /* Avoid exceeding SSIZE_T_MAX */
14732
1.53k
    if (prec > INT_MAX-3) {
14733
0
        PyErr_SetString(PyExc_OverflowError,
14734
0
                        "precision too large");
14735
0
        return NULL;
14736
0
    }
14737
14738
1.53k
    assert(PyLong_Check(val));
14739
14740
1.53k
    switch (type) {
14741
0
    default:
14742
0
        Py_UNREACHABLE();
14743
0
    case 'd':
14744
0
    case 'i':
14745
0
    case 'u':
14746
        /* int and int subclasses should print numerically when a numeric */
14747
        /* format code is used (see issue18780) */
14748
0
        result = PyNumber_ToBase(val, 10);
14749
0
        break;
14750
0
    case 'o':
14751
0
        numnondigits = 2;
14752
0
        result = PyNumber_ToBase(val, 8);
14753
0
        break;
14754
0
    case 'x':
14755
1.53k
    case 'X':
14756
1.53k
        numnondigits = 2;
14757
1.53k
        result = PyNumber_ToBase(val, 16);
14758
1.53k
        break;
14759
1.53k
    }
14760
1.53k
    if (!result)
14761
0
        return NULL;
14762
14763
1.53k
    assert(unicode_modifiable(result));
14764
1.53k
    assert(PyUnicode_IS_ASCII(result));
14765
14766
    /* To modify the string in-place, there can only be one reference. */
14767
1.53k
    if (!_PyObject_IsUniquelyReferenced(result)) {
14768
0
        Py_DECREF(result);
14769
0
        PyErr_BadInternalCall();
14770
0
        return NULL;
14771
0
    }
14772
1.53k
    buf = PyUnicode_DATA(result);
14773
1.53k
    llen = PyUnicode_GET_LENGTH(result);
14774
1.53k
    if (llen > INT_MAX) {
14775
0
        Py_DECREF(result);
14776
0
        PyErr_SetString(PyExc_ValueError,
14777
0
                        "string too large in _PyUnicode_FormatLong");
14778
0
        return NULL;
14779
0
    }
14780
1.53k
    len = (int)llen;
14781
1.53k
    sign = buf[0] == '-';
14782
1.53k
    numnondigits += sign;
14783
1.53k
    numdigits = len - numnondigits;
14784
1.53k
    assert(numdigits > 0);
14785
14786
    /* Get rid of base marker unless F_ALT */
14787
1.53k
    if (((alt) == 0 &&
14788
1.53k
        (type == 'o' || type == 'x' || type == 'X'))) {
14789
1.53k
        assert(buf[sign] == '0');
14790
1.53k
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14791
1.53k
               buf[sign+1] == 'o');
14792
1.53k
        numnondigits -= 2;
14793
1.53k
        buf += 2;
14794
1.53k
        len -= 2;
14795
1.53k
        if (sign)
14796
0
            buf[0] = '-';
14797
1.53k
        assert(len == numnondigits + numdigits);
14798
1.53k
        assert(numdigits > 0);
14799
1.53k
    }
14800
14801
    /* Fill with leading zeroes to meet minimum width. */
14802
1.53k
    if (prec > numdigits) {
14803
0
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14804
0
                                numnondigits + prec);
14805
0
        char *b1;
14806
0
        if (!r1) {
14807
0
            Py_DECREF(result);
14808
0
            return NULL;
14809
0
        }
14810
0
        b1 = PyBytes_AS_STRING(r1);
14811
0
        for (i = 0; i < numnondigits; ++i)
14812
0
            *b1++ = *buf++;
14813
0
        for (i = 0; i < prec - numdigits; i++)
14814
0
            *b1++ = '0';
14815
0
        for (i = 0; i < numdigits; i++)
14816
0
            *b1++ = *buf++;
14817
0
        *b1 = '\0';
14818
0
        Py_SETREF(result, r1);
14819
0
        buf = PyBytes_AS_STRING(result);
14820
0
        len = numnondigits + prec;
14821
0
    }
14822
14823
    /* Fix up case for hex conversions. */
14824
1.53k
    if (type == 'X') {
14825
        /* Need to convert all lower case letters to upper case.
14826
           and need to convert 0x to 0X (and -0x to -0X). */
14827
4.51k
        for (i = 0; i < len; i++)
14828
2.97k
            if (buf[i] >= 'a' && buf[i] <= 'x')
14829
1.15k
                buf[i] -= 'a'-'A';
14830
1.53k
    }
14831
1.53k
    if (!PyUnicode_Check(result)
14832
1.53k
        || buf != PyUnicode_DATA(result)) {
14833
1.53k
        PyObject *unicode;
14834
1.53k
        unicode = _PyUnicode_FromASCII(buf, len);
14835
1.53k
        Py_SETREF(result, unicode);
14836
1.53k
    }
14837
0
    else if (len != PyUnicode_GET_LENGTH(result)) {
14838
0
        if (PyUnicode_Resize(&result, len) < 0)
14839
0
            Py_CLEAR(result);
14840
0
    }
14841
1.53k
    return result;
14842
1.53k
}
14843
14844
/* Format an integer or a float as an integer.
14845
 * Return 1 if the number has been formatted into the writer,
14846
 *        0 if the number has been formatted into *p_output
14847
 *       -1 and raise an exception on error */
14848
static int
14849
mainformatlong(PyObject *v,
14850
               struct unicode_format_arg_t *arg,
14851
               PyObject **p_output,
14852
               _PyUnicodeWriter *writer)
14853
10.9M
{
14854
10.9M
    PyObject *iobj, *res;
14855
10.9M
    char type = (char)arg->ch;
14856
14857
10.9M
    if (!PyNumber_Check(v))
14858
4.33M
        goto wrongtype;
14859
14860
    /* make sure number is a type of integer for o, x, and X */
14861
6.64M
    if (!PyLong_Check(v)) {
14862
0
        if (type == 'o' || type == 'x' || type == 'X') {
14863
0
            iobj = _PyNumber_Index(v);
14864
0
        }
14865
0
        else {
14866
0
            iobj = PyNumber_Long(v);
14867
0
        }
14868
0
        if (iobj == NULL ) {
14869
0
            if (PyErr_ExceptionMatches(PyExc_TypeError))
14870
0
                goto wrongtype;
14871
0
            return -1;
14872
0
        }
14873
0
        assert(PyLong_Check(iobj));
14874
0
    }
14875
6.64M
    else {
14876
6.64M
        iobj = Py_NewRef(v);
14877
6.64M
    }
14878
14879
6.64M
    if (PyLong_CheckExact(v)
14880
6.64M
        && arg->width == -1 && arg->prec == -1
14881
6.64M
        && !(arg->flags & (F_SIGN | F_BLANK))
14882
6.64M
        && type != 'X')
14883
6.64M
    {
14884
        /* Fast path */
14885
6.64M
        int alternate = arg->flags & F_ALT;
14886
6.64M
        int base;
14887
14888
6.64M
        switch(type)
14889
6.64M
        {
14890
0
            default:
14891
0
                Py_UNREACHABLE();
14892
6.64M
            case 'd':
14893
6.64M
            case 'i':
14894
6.64M
            case 'u':
14895
6.64M
                base = 10;
14896
6.64M
                break;
14897
0
            case 'o':
14898
0
                base = 8;
14899
0
                break;
14900
0
            case 'x':
14901
0
            case 'X':
14902
0
                base = 16;
14903
0
                break;
14904
6.64M
        }
14905
14906
6.64M
        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14907
0
            Py_DECREF(iobj);
14908
0
            return -1;
14909
0
        }
14910
6.64M
        Py_DECREF(iobj);
14911
6.64M
        return 1;
14912
6.64M
    }
14913
14914
1.53k
    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14915
1.53k
    Py_DECREF(iobj);
14916
1.53k
    if (res == NULL)
14917
0
        return -1;
14918
1.53k
    *p_output = res;
14919
1.53k
    return 0;
14920
14921
4.33M
wrongtype:
14922
4.33M
    switch(type)
14923
4.33M
    {
14924
0
        case 'o':
14925
0
        case 'x':
14926
0
        case 'X':
14927
0
            PyErr_Format(PyExc_TypeError,
14928
0
                    "%%%c format: an integer is required, "
14929
0
                    "not %.200s",
14930
0
                    type, Py_TYPE(v)->tp_name);
14931
0
            break;
14932
4.33M
        default:
14933
4.33M
            PyErr_Format(PyExc_TypeError,
14934
4.33M
                    "%%%c format: a real number is required, "
14935
4.33M
                    "not %.200s",
14936
4.33M
                    type, Py_TYPE(v)->tp_name);
14937
4.33M
            break;
14938
4.33M
    }
14939
4.33M
    return -1;
14940
4.33M
}
14941
14942
static Py_UCS4
14943
formatchar(PyObject *v)
14944
0
{
14945
    /* presume that the buffer is at least 3 characters long */
14946
0
    if (PyUnicode_Check(v)) {
14947
0
        if (PyUnicode_GET_LENGTH(v) == 1) {
14948
0
            return PyUnicode_READ_CHAR(v, 0);
14949
0
        }
14950
0
        PyErr_Format(PyExc_TypeError,
14951
0
                     "%%c requires an int or a unicode character, "
14952
0
                     "not a string of length %zd",
14953
0
                     PyUnicode_GET_LENGTH(v));
14954
0
        return (Py_UCS4) -1;
14955
0
    }
14956
0
    else {
14957
0
        int overflow;
14958
0
        long x = PyLong_AsLongAndOverflow(v, &overflow);
14959
0
        if (x == -1 && PyErr_Occurred()) {
14960
0
            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14961
0
                PyErr_Format(PyExc_TypeError,
14962
0
                             "%%c requires an int or a unicode character, not %T",
14963
0
                             v);
14964
0
                return (Py_UCS4) -1;
14965
0
            }
14966
0
            return (Py_UCS4) -1;
14967
0
        }
14968
14969
0
        if (x < 0 || x > MAX_UNICODE) {
14970
            /* this includes an overflow in converting to C long */
14971
0
            PyErr_SetString(PyExc_OverflowError,
14972
0
                            "%c arg not in range(0x110000)");
14973
0
            return (Py_UCS4) -1;
14974
0
        }
14975
14976
0
        return (Py_UCS4) x;
14977
0
    }
14978
0
}
14979
14980
/* Parse options of an argument: flags, width, precision.
14981
   Handle also "%(name)" syntax.
14982
14983
   Return 0 if the argument has been formatted into arg->str.
14984
   Return 1 if the argument has been written into ctx->writer,
14985
   Raise an exception and return -1 on error. */
14986
static int
14987
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14988
                         struct unicode_format_arg_t *arg)
14989
45.4M
{
14990
45.4M
#define FORMAT_READ(ctx) \
14991
45.7M
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14992
14993
45.4M
    PyObject *v;
14994
14995
45.4M
    if (arg->ch == '(') {
14996
        /* Get argument value from a dictionary. Example: "%(name)s". */
14997
38.3k
        Py_ssize_t keystart;
14998
38.3k
        Py_ssize_t keylen;
14999
38.3k
        PyObject *key;
15000
38.3k
        int pcount = 1;
15001
15002
38.3k
        if (ctx->dict == NULL) {
15003
0
            PyErr_SetString(PyExc_TypeError,
15004
0
                            "format requires a mapping");
15005
0
            return -1;
15006
0
        }
15007
38.3k
        ++ctx->fmtpos;
15008
38.3k
        --ctx->fmtcnt;
15009
38.3k
        keystart = ctx->fmtpos;
15010
        /* Skip over balanced parentheses */
15011
344k
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
15012
306k
            arg->ch = FORMAT_READ(ctx);
15013
306k
            if (arg->ch == ')')
15014
38.3k
                --pcount;
15015
268k
            else if (arg->ch == '(')
15016
0
                ++pcount;
15017
306k
            ctx->fmtpos++;
15018
306k
        }
15019
38.3k
        keylen = ctx->fmtpos - keystart - 1;
15020
38.3k
        if (ctx->fmtcnt < 0 || pcount > 0) {
15021
0
            PyErr_SetString(PyExc_ValueError,
15022
0
                            "incomplete format key");
15023
0
            return -1;
15024
0
        }
15025
38.3k
        key = PyUnicode_Substring(ctx->fmtstr,
15026
38.3k
                                  keystart, keystart + keylen);
15027
38.3k
        if (key == NULL)
15028
0
            return -1;
15029
38.3k
        if (ctx->args_owned) {
15030
27.3k
            ctx->args_owned = 0;
15031
27.3k
            Py_DECREF(ctx->args);
15032
27.3k
        }
15033
38.3k
        ctx->args = PyObject_GetItem(ctx->dict, key);
15034
38.3k
        Py_DECREF(key);
15035
38.3k
        if (ctx->args == NULL)
15036
0
            return -1;
15037
38.3k
        ctx->args_owned = 1;
15038
38.3k
        ctx->arglen = -1;
15039
38.3k
        ctx->argidx = -2;
15040
38.3k
    }
15041
15042
    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
15043
45.4M
    while (--ctx->fmtcnt >= 0) {
15044
45.4M
        arg->ch = FORMAT_READ(ctx);
15045
45.4M
        ctx->fmtpos++;
15046
45.4M
        switch (arg->ch) {
15047
0
        case '-': arg->flags |= F_LJUST; continue;
15048
0
        case '+': arg->flags |= F_SIGN; continue;
15049
0
        case ' ': arg->flags |= F_BLANK; continue;
15050
0
        case '#': arg->flags |= F_ALT; continue;
15051
1.53k
        case '0': arg->flags |= F_ZERO; continue;
15052
45.4M
        }
15053
45.4M
        break;
15054
45.4M
    }
15055
15056
    /* Parse width. Example: "%10s" => width=10 */
15057
45.4M
    if (arg->ch == '*') {
15058
0
        v = unicode_format_getnextarg(ctx);
15059
0
        if (v == NULL)
15060
0
            return -1;
15061
0
        if (!PyLong_Check(v)) {
15062
0
            PyErr_SetString(PyExc_TypeError,
15063
0
                            "* wants int");
15064
0
            return -1;
15065
0
        }
15066
0
        arg->width = PyLong_AsSsize_t(v);
15067
0
        if (arg->width == -1 && PyErr_Occurred())
15068
0
            return -1;
15069
0
        if (arg->width < 0) {
15070
0
            arg->flags |= F_LJUST;
15071
0
            arg->width = -arg->width;
15072
0
        }
15073
0
        if (--ctx->fmtcnt >= 0) {
15074
0
            arg->ch = FORMAT_READ(ctx);
15075
0
            ctx->fmtpos++;
15076
0
        }
15077
0
    }
15078
45.4M
    else if (arg->ch >= '0' && arg->ch <= '9') {
15079
1.53k
        arg->width = arg->ch - '0';
15080
1.53k
        while (--ctx->fmtcnt >= 0) {
15081
1.53k
            arg->ch = FORMAT_READ(ctx);
15082
1.53k
            ctx->fmtpos++;
15083
1.53k
            if (arg->ch < '0' || arg->ch > '9')
15084
1.53k
                break;
15085
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
15086
               mixing signed and unsigned comparison. Since arg->ch is between
15087
               '0' and '9', casting to int is safe. */
15088
0
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15089
0
                PyErr_SetString(PyExc_ValueError,
15090
0
                                "width too big");
15091
0
                return -1;
15092
0
            }
15093
0
            arg->width = arg->width*10 + (arg->ch - '0');
15094
0
        }
15095
1.53k
    }
15096
15097
    /* Parse precision. Example: "%.3f" => prec=3 */
15098
45.4M
    if (arg->ch == '.') {
15099
0
        arg->prec = 0;
15100
0
        if (--ctx->fmtcnt >= 0) {
15101
0
            arg->ch = FORMAT_READ(ctx);
15102
0
            ctx->fmtpos++;
15103
0
        }
15104
0
        if (arg->ch == '*') {
15105
0
            v = unicode_format_getnextarg(ctx);
15106
0
            if (v == NULL)
15107
0
                return -1;
15108
0
            if (!PyLong_Check(v)) {
15109
0
                PyErr_SetString(PyExc_TypeError,
15110
0
                                "* wants int");
15111
0
                return -1;
15112
0
            }
15113
0
            arg->prec = PyLong_AsInt(v);
15114
0
            if (arg->prec == -1 && PyErr_Occurred())
15115
0
                return -1;
15116
0
            if (arg->prec < 0)
15117
0
                arg->prec = 0;
15118
0
            if (--ctx->fmtcnt >= 0) {
15119
0
                arg->ch = FORMAT_READ(ctx);
15120
0
                ctx->fmtpos++;
15121
0
            }
15122
0
        }
15123
0
        else if (arg->ch >= '0' && arg->ch <= '9') {
15124
0
            arg->prec = arg->ch - '0';
15125
0
            while (--ctx->fmtcnt >= 0) {
15126
0
                arg->ch = FORMAT_READ(ctx);
15127
0
                ctx->fmtpos++;
15128
0
                if (arg->ch < '0' || arg->ch > '9')
15129
0
                    break;
15130
0
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15131
0
                    PyErr_SetString(PyExc_ValueError,
15132
0
                                    "precision too big");
15133
0
                    return -1;
15134
0
                }
15135
0
                arg->prec = arg->prec*10 + (arg->ch - '0');
15136
0
            }
15137
0
        }
15138
0
    }
15139
15140
    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15141
45.4M
    if (ctx->fmtcnt >= 0) {
15142
45.4M
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15143
0
            if (--ctx->fmtcnt >= 0) {
15144
0
                arg->ch = FORMAT_READ(ctx);
15145
0
                ctx->fmtpos++;
15146
0
            }
15147
0
        }
15148
45.4M
    }
15149
45.4M
    if (ctx->fmtcnt < 0) {
15150
0
        PyErr_SetString(PyExc_ValueError,
15151
0
                        "incomplete format");
15152
0
        return -1;
15153
0
    }
15154
45.4M
    return 0;
15155
15156
45.4M
#undef FORMAT_READ
15157
45.4M
}
15158
15159
/* Format one argument. Supported conversion specifiers:
15160
15161
   - "s", "r", "a": any type
15162
   - "i", "d", "u": int or float
15163
   - "o", "x", "X": int
15164
   - "e", "E", "f", "F", "g", "G": float
15165
   - "c": int or str (1 character)
15166
15167
   When possible, the output is written directly into the Unicode writer
15168
   (ctx->writer). A string is created when padding is required.
15169
15170
   Return 0 if the argument has been formatted into *p_str,
15171
          1 if the argument has been written into ctx->writer,
15172
         -1 on error. */
15173
static int
15174
unicode_format_arg_format(struct unicode_formatter_t *ctx,
15175
                          struct unicode_format_arg_t *arg,
15176
                          PyObject **p_str)
15177
45.4M
{
15178
45.4M
    PyObject *v;
15179
45.4M
    _PyUnicodeWriter *writer = &ctx->writer;
15180
15181
45.4M
    if (ctx->fmtcnt == 0)
15182
10.8M
        ctx->writer.overallocate = 0;
15183
15184
45.4M
    v = unicode_format_getnextarg(ctx);
15185
45.4M
    if (v == NULL)
15186
0
        return -1;
15187
15188
15189
45.4M
    switch (arg->ch) {
15190
34.5M
    case 's':
15191
34.5M
    case 'r':
15192
34.5M
    case 'a':
15193
34.5M
        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15194
            /* Fast path */
15195
0
            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15196
0
                return -1;
15197
0
            return 1;
15198
0
        }
15199
15200
34.5M
        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15201
34.5M
            *p_str = Py_NewRef(v);
15202
34.5M
        }
15203
0
        else {
15204
0
            if (arg->ch == 's')
15205
0
                *p_str = PyObject_Str(v);
15206
0
            else if (arg->ch == 'r')
15207
0
                *p_str = PyObject_Repr(v);
15208
0
            else
15209
0
                *p_str = PyObject_ASCII(v);
15210
0
        }
15211
34.5M
        break;
15212
15213
0
    case 'i':
15214
10.9M
    case 'd':
15215
10.9M
    case 'u':
15216
10.9M
    case 'o':
15217
10.9M
    case 'x':
15218
10.9M
    case 'X':
15219
10.9M
    {
15220
10.9M
        int ret = mainformatlong(v, arg, p_str, writer);
15221
10.9M
        if (ret != 0)
15222
10.9M
            return ret;
15223
1.53k
        arg->sign = 1;
15224
1.53k
        break;
15225
10.9M
    }
15226
15227
0
    case 'e':
15228
0
    case 'E':
15229
0
    case 'f':
15230
0
    case 'F':
15231
0
    case 'g':
15232
0
    case 'G':
15233
0
        if (arg->width == -1 && arg->prec == -1
15234
0
            && !(arg->flags & (F_SIGN | F_BLANK)))
15235
0
        {
15236
            /* Fast path */
15237
0
            if (formatfloat(v, arg, NULL, writer) == -1)
15238
0
                return -1;
15239
0
            return 1;
15240
0
        }
15241
15242
0
        arg->sign = 1;
15243
0
        if (formatfloat(v, arg, p_str, NULL) == -1)
15244
0
            return -1;
15245
0
        break;
15246
15247
0
    case 'c':
15248
0
    {
15249
0
        Py_UCS4 ch = formatchar(v);
15250
0
        if (ch == (Py_UCS4) -1)
15251
0
            return -1;
15252
0
        if (arg->width == -1 && arg->prec == -1) {
15253
            /* Fast path */
15254
0
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
15255
0
                return -1;
15256
0
            return 1;
15257
0
        }
15258
0
        *p_str = PyUnicode_FromOrdinal(ch);
15259
0
        break;
15260
0
    }
15261
15262
0
    default:
15263
0
        PyErr_Format(PyExc_ValueError,
15264
0
                     "unsupported format character '%c' (0x%x) "
15265
0
                     "at index %zd",
15266
0
                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15267
0
                     (int)arg->ch,
15268
0
                     ctx->fmtpos - 1);
15269
0
        return -1;
15270
45.4M
    }
15271
34.5M
    if (*p_str == NULL)
15272
0
        return -1;
15273
34.5M
    assert (PyUnicode_Check(*p_str));
15274
34.5M
    return 0;
15275
34.5M
}
15276
15277
static int
15278
unicode_format_arg_output(struct unicode_formatter_t *ctx,
15279
                          struct unicode_format_arg_t *arg,
15280
                          PyObject *str)
15281
34.5M
{
15282
34.5M
    Py_ssize_t len;
15283
34.5M
    int kind;
15284
34.5M
    const void *pbuf;
15285
34.5M
    Py_ssize_t pindex;
15286
34.5M
    Py_UCS4 signchar;
15287
34.5M
    Py_ssize_t buflen;
15288
34.5M
    Py_UCS4 maxchar;
15289
34.5M
    Py_ssize_t sublen;
15290
34.5M
    _PyUnicodeWriter *writer = &ctx->writer;
15291
34.5M
    Py_UCS4 fill;
15292
15293
34.5M
    fill = ' ';
15294
34.5M
    if (arg->sign && arg->flags & F_ZERO)
15295
1.53k
        fill = '0';
15296
15297
34.5M
    len = PyUnicode_GET_LENGTH(str);
15298
34.5M
    if ((arg->width == -1 || arg->width <= len)
15299
34.5M
        && (arg->prec == -1 || arg->prec >= len)
15300
34.5M
        && !(arg->flags & (F_SIGN | F_BLANK)))
15301
34.5M
    {
15302
        /* Fast path */
15303
34.5M
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15304
0
            return -1;
15305
34.5M
        return 0;
15306
34.5M
    }
15307
15308
    /* Truncate the string for "s", "r" and "a" formats
15309
       if the precision is set */
15310
96
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15311
0
        if (arg->prec >= 0 && len > arg->prec)
15312
0
            len = arg->prec;
15313
0
    }
15314
15315
    /* Adjust sign and width */
15316
96
    kind = PyUnicode_KIND(str);
15317
96
    pbuf = PyUnicode_DATA(str);
15318
96
    pindex = 0;
15319
96
    signchar = '\0';
15320
96
    if (arg->sign) {
15321
96
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15322
96
        if (ch == '-' || ch == '+') {
15323
0
            signchar = ch;
15324
0
            len--;
15325
0
            pindex++;
15326
0
        }
15327
96
        else if (arg->flags & F_SIGN)
15328
0
            signchar = '+';
15329
96
        else if (arg->flags & F_BLANK)
15330
0
            signchar = ' ';
15331
96
        else
15332
96
            arg->sign = 0;
15333
96
    }
15334
96
    if (arg->width < len)
15335
0
        arg->width = len;
15336
15337
    /* Prepare the writer */
15338
96
    maxchar = writer->maxchar;
15339
96
    if (!(arg->flags & F_LJUST)) {
15340
96
        if (arg->sign) {
15341
0
            if ((arg->width-1) > len)
15342
0
                maxchar = Py_MAX(maxchar, fill);
15343
0
        }
15344
96
        else {
15345
96
            if (arg->width > len)
15346
96
                maxchar = Py_MAX(maxchar, fill);
15347
96
        }
15348
96
    }
15349
96
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15350
0
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
15351
0
        maxchar = Py_MAX(maxchar, strmaxchar);
15352
0
    }
15353
15354
96
    buflen = arg->width;
15355
96
    if (arg->sign && len == arg->width)
15356
0
        buflen++;
15357
96
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
15358
0
        return -1;
15359
15360
    /* Write the sign if needed */
15361
96
    if (arg->sign) {
15362
0
        if (fill != ' ') {
15363
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15364
0
            writer->pos += 1;
15365
0
        }
15366
0
        if (arg->width > len)
15367
0
            arg->width--;
15368
0
    }
15369
15370
    /* Write the numeric prefix for "x", "X" and "o" formats
15371
       if the alternate form is used.
15372
       For example, write "0x" for the "%#x" format. */
15373
96
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15374
0
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15375
0
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15376
0
        if (fill != ' ') {
15377
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15378
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15379
0
            writer->pos += 2;
15380
0
            pindex += 2;
15381
0
        }
15382
0
        arg->width -= 2;
15383
0
        if (arg->width < 0)
15384
0
            arg->width = 0;
15385
0
        len -= 2;
15386
0
    }
15387
15388
    /* Pad left with the fill character if needed */
15389
96
    if (arg->width > len && !(arg->flags & F_LJUST)) {
15390
96
        sublen = arg->width - len;
15391
96
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
15392
96
        writer->pos += sublen;
15393
96
        arg->width = len;
15394
96
    }
15395
15396
    /* If padding with spaces: write sign if needed and/or numeric prefix if
15397
       the alternate form is used */
15398
96
    if (fill == ' ') {
15399
0
        if (arg->sign) {
15400
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15401
0
            writer->pos += 1;
15402
0
        }
15403
0
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15404
0
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15405
0
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15406
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15407
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15408
0
            writer->pos += 2;
15409
0
            pindex += 2;
15410
0
        }
15411
0
    }
15412
15413
    /* Write characters */
15414
96
    if (len) {
15415
96
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15416
96
                                      str, pindex, len);
15417
96
        writer->pos += len;
15418
96
    }
15419
15420
    /* Pad right with the fill character if needed */
15421
96
    if (arg->width > len) {
15422
0
        sublen = arg->width - len;
15423
0
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
15424
0
        writer->pos += sublen;
15425
0
    }
15426
96
    return 0;
15427
96
}
15428
15429
/* Helper of PyUnicode_Format(): format one arg.
15430
   Return 0 on success, raise an exception and return -1 on error. */
15431
static int
15432
unicode_format_arg(struct unicode_formatter_t *ctx)
15433
45.4M
{
15434
45.4M
    struct unicode_format_arg_t arg;
15435
45.4M
    PyObject *str;
15436
45.4M
    int ret;
15437
15438
45.4M
    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15439
45.4M
    if (arg.ch == '%') {
15440
0
        ctx->fmtpos++;
15441
0
        ctx->fmtcnt--;
15442
0
        if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15443
0
            return -1;
15444
0
        return 0;
15445
0
    }
15446
45.4M
    arg.flags = 0;
15447
45.4M
    arg.width = -1;
15448
45.4M
    arg.prec = -1;
15449
45.4M
    arg.sign = 0;
15450
45.4M
    str = NULL;
15451
15452
45.4M
    ret = unicode_format_arg_parse(ctx, &arg);
15453
45.4M
    if (ret == -1)
15454
0
        return -1;
15455
15456
45.4M
    ret = unicode_format_arg_format(ctx, &arg, &str);
15457
45.4M
    if (ret == -1)
15458
4.33M
        return -1;
15459
15460
41.1M
    if (ret != 1) {
15461
34.5M
        ret = unicode_format_arg_output(ctx, &arg, str);
15462
34.5M
        Py_DECREF(str);
15463
34.5M
        if (ret == -1)
15464
0
            return -1;
15465
34.5M
    }
15466
15467
41.1M
    if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15468
0
        PyErr_SetString(PyExc_TypeError,
15469
0
                        "not all arguments converted during string formatting");
15470
0
        return -1;
15471
0
    }
15472
41.1M
    return 0;
15473
41.1M
}
15474
15475
/* Build the result of "format % args".

   `format` must be a str (checked with ensure_unicode()); `args` is either
   a tuple of positional values, a mapping, or a single value.  Returns a
   new string from the writer, or NULL with an exception set. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (ensure_unicode(format) < 0) {
        return NULL;
    }

    /* Cursor over the format string: fmtpos is the read index, fmtcnt the
       number of characters still to be consumed. */
    ctx.fmtstr = format;
    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
    ctx.fmtpos = 0;

    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    const int args_is_tuple = PyTuple_Check(args);
    if (args_is_tuple) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
    }
    else {
        /* Single non-tuple argument: sentinel values for length/index. */
        ctx.arglen = -1;
        ctx.argidx = -2;
    }
    ctx.args = args;
    ctx.args_owned = 0;
    /* Only a mapping that is neither a tuple nor a str is used for
       "%(name)s" style lookups. */
    ctx.dict = (PyMapping_Check(args) && !args_is_tuple
                && !PyUnicode_Check(args)) ? args : NULL;

    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) == '%') {
            /* Step over the '%' and let the helper handle the specifier. */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1) {
                goto error;
            }
            continue;
        }

        /* Copy a maximal run of literal (non-'%') characters in one go. */
        Py_ssize_t literal_start = ctx.fmtpos++;
        while (ctx.fmtcnt >= 0 &&
               PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            ctx.fmtpos++;
            ctx.fmtcnt--;
        }
        if (ctx.fmtcnt < 0) {
            /* The run reached the end of the format string: step the index
               back by one and stop over-allocating, since this is the last
               write. */
            ctx.fmtpos--;
            ctx.writer.overallocate = 0;
        }

        if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                            literal_start, ctx.fmtpos) < 0) {
            goto error;
        }
    }

    if (ctx.argidx < ctx.arglen && !ctx.dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto error;
    }

    /* args_owned starts at 0 in this function; it is only non-zero if a
       helper changed it (presumably after replacing ctx.args with an owned
       reference -- confirm against the helpers). */
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return _PyUnicodeWriter_Finish(&ctx.writer);

  error:
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}
15557
15558
static PyObject *
15559
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15560
15561
/*[clinic input]
15562
@classmethod
15563
str.__new__ as unicode_new
15564
15565
    object as x: object = NULL
15566
    encoding: str = NULL
15567
    errors: str = NULL
15568
15569
[clinic start generated code]*/
15570
15571
static PyObject *
15572
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15573
                 const char *errors)
15574
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15575
9.82M
{
15576
9.82M
    PyObject *unicode;
15577
9.82M
    if (x == NULL) {
15578
0
        unicode = unicode_get_empty();
15579
0
    }
15580
9.82M
    else if (encoding == NULL && errors == NULL) {
15581
9.82M
        unicode = PyObject_Str(x);
15582
9.82M
    }
15583
0
    else {
15584
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15585
0
    }
15586
15587
9.82M
    if (unicode != NULL && type != &PyUnicode_Type) {
15588
9.82M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15589
9.82M
    }
15590
9.82M
    return unicode;
15591
9.82M
}
15592
15593
static const char *
15594
arg_as_utf8(PyObject *obj, const char *name)
15595
1.57M
{
15596
1.57M
    if (!PyUnicode_Check(obj)) {
15597
0
        PyErr_Format(PyExc_TypeError,
15598
0
                     "str() argument '%s' must be str, not %T",
15599
0
                     name, obj);
15600
0
        return NULL;
15601
0
    }
15602
1.57M
    return _PyUnicode_AsUTF8NoNUL(obj);
15603
1.57M
}
15604
15605
/* Vectorcall entry point for str(...).

   Positional-only calls are handled directly; any call that carries
   keyword arguments is repackaged as (tuple, dict) and forwarded to
   unicode_new().  Returns a new reference or NULL with an exception set. */
static PyObject *
unicode_vectorcall(PyObject *type, PyObject *const *args,
                   size_t nargsf, PyObject *kwnames)
{
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));

    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);

    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
        // Fallback to unicode_new()
        PyObject *posargs = _PyTuple_FromArray(args, nargs);
        if (posargs == NULL) {
            return NULL;
        }
        PyObject *result = NULL;
        PyObject *kwargs = _PyStack_AsDict(args + nargs, kwnames);
        if (kwargs != NULL) {
            result = unicode_new(_PyType_CAST(type), posargs, kwargs);
        }
        Py_DECREF(posargs);
        Py_XDECREF(kwargs);
        return result;
    }

    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
        return NULL;
    }

    if (nargs == 0) {
        return unicode_get_empty();
    }
    if (nargs == 1) {
        return PyObject_Str(args[0]);
    }

    /* Two or three arguments: str(object, encoding[, errors]). */
    const char *encoding = arg_as_utf8(args[1], "encoding");
    if (encoding == NULL) {
        return NULL;
    }

    const char *errors = NULL;
    if (nargs == 3) {
        errors = arg_as_utf8(args[2], "errors");
        if (errors == NULL) {
            return NULL;
        }
    }
    return PyUnicode_FromEncodedObject(args[0], encoding, errors);
}
15651
15652
static PyObject *
15653
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15654
9.82M
{
15655
9.82M
    PyObject *self;
15656
9.82M
    Py_ssize_t length, char_size;
15657
9.82M
    int share_utf8;
15658
9.82M
    int kind;
15659
9.82M
    void *data;
15660
15661
9.82M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
15662
9.82M
    assert(_PyUnicode_CHECK(unicode));
15663
15664
9.82M
    self = type->tp_alloc(type, 0);
15665
9.82M
    if (self == NULL) {
15666
0
        return NULL;
15667
0
    }
15668
9.82M
    kind = PyUnicode_KIND(unicode);
15669
9.82M
    length = PyUnicode_GET_LENGTH(unicode);
15670
15671
9.82M
    _PyUnicode_LENGTH(self) = length;
15672
#ifdef Py_DEBUG
15673
    _PyUnicode_HASH(self) = -1;
15674
#else
15675
9.82M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15676
9.82M
#endif
15677
9.82M
    _PyUnicode_STATE(self).interned = 0;
15678
9.82M
    _PyUnicode_STATE(self).kind = kind;
15679
9.82M
    _PyUnicode_STATE(self).compact = 0;
15680
9.82M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15681
9.82M
    _PyUnicode_STATE(self).statically_allocated = 0;
15682
9.82M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
15683
9.82M
    PyUnicode_SET_UTF8(self, NULL);
15684
9.82M
    _PyUnicode_DATA_ANY(self) = NULL;
15685
15686
9.82M
    share_utf8 = 0;
15687
9.82M
    if (kind == PyUnicode_1BYTE_KIND) {
15688
8.70M
        char_size = 1;
15689
8.70M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15690
8.67M
            share_utf8 = 1;
15691
8.70M
    }
15692
1.11M
    else if (kind == PyUnicode_2BYTE_KIND) {
15693
1.06M
        char_size = 2;
15694
1.06M
    }
15695
50.2k
    else {
15696
50.2k
        assert(kind == PyUnicode_4BYTE_KIND);
15697
50.2k
        char_size = 4;
15698
50.2k
    }
15699
15700
    /* Ensure we won't overflow the length. */
15701
9.82M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15702
0
        PyErr_NoMemory();
15703
0
        goto onError;
15704
0
    }
15705
9.82M
    data = PyMem_Malloc((length + 1) * char_size);
15706
9.82M
    if (data == NULL) {
15707
0
        PyErr_NoMemory();
15708
0
        goto onError;
15709
0
    }
15710
15711
9.82M
    _PyUnicode_DATA_ANY(self) = data;
15712
9.82M
    if (share_utf8) {
15713
8.67M
        PyUnicode_SET_UTF8_LENGTH(self, length);
15714
8.67M
        PyUnicode_SET_UTF8(self, data);
15715
8.67M
    }
15716
15717
9.82M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
15718
9.82M
    assert(_PyUnicode_CheckConsistency(self, 1));
15719
#ifdef Py_DEBUG
15720
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15721
#endif
15722
9.82M
    return self;
15723
15724
0
onError:
15725
0
    Py_DECREF(self);
15726
0
    return NULL;
15727
9.82M
}
15728
15729
void
15730
_PyUnicode_ExactDealloc(PyObject *op)
15731
54.2M
{
15732
54.2M
    assert(PyUnicode_CheckExact(op));
15733
54.2M
    unicode_dealloc(op);
15734
54.2M
}
15735
15736
PyDoc_STRVAR(unicode_doc,
15737
"str(object='') -> str\n\
15738
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15739
\n\
15740
Create a new string object from the given object. If encoding or\n\
15741
errors is specified, then the object must expose a data buffer\n\
15742
that will be decoded using the given encoding and error handler.\n\
15743
Otherwise, returns the result of object.__str__() (if defined)\n\
15744
or repr(object).\n\
15745
encoding defaults to 'utf-8'.\n\
15746
errors defaults to 'strict'.");
15747
15748
static PyObject *unicode_iter(PyObject *seq);
15749
15750
PyTypeObject PyUnicode_Type = {
15751
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
15752
    "str",                        /* tp_name */
15753
    sizeof(PyUnicodeObject),      /* tp_basicsize */
15754
    0,                            /* tp_itemsize */
15755
    /* Slots */
15756
    unicode_dealloc,              /* tp_dealloc */
15757
    0,                            /* tp_vectorcall_offset */
15758
    0,                            /* tp_getattr */
15759
    0,                            /* tp_setattr */
15760
    0,                            /* tp_as_async */
15761
    unicode_repr,                 /* tp_repr */
15762
    &unicode_as_number,           /* tp_as_number */
15763
    &unicode_as_sequence,         /* tp_as_sequence */
15764
    &unicode_as_mapping,          /* tp_as_mapping */
15765
    unicode_hash,                 /* tp_hash*/
15766
    0,                            /* tp_call*/
15767
    unicode_str,                  /* tp_str */
15768
    PyObject_GenericGetAttr,      /* tp_getattro */
15769
    0,                            /* tp_setattro */
15770
    0,                            /* tp_as_buffer */
15771
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15772
        Py_TPFLAGS_UNICODE_SUBCLASS |
15773
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
15774
    unicode_doc,                  /* tp_doc */
15775
    0,                            /* tp_traverse */
15776
    0,                            /* tp_clear */
15777
    PyUnicode_RichCompare,        /* tp_richcompare */
15778
    0,                            /* tp_weaklistoffset */
15779
    unicode_iter,                 /* tp_iter */
15780
    0,                            /* tp_iternext */
15781
    unicode_methods,              /* tp_methods */
15782
    0,                            /* tp_members */
15783
    0,                            /* tp_getset */
15784
    0,                            /* tp_base */
15785
    0,                            /* tp_dict */
15786
    0,                            /* tp_descr_get */
15787
    0,                            /* tp_descr_set */
15788
    0,                            /* tp_dictoffset */
15789
    0,                            /* tp_init */
15790
    0,                            /* tp_alloc */
15791
    unicode_new,                  /* tp_new */
15792
    PyObject_Free,                /* tp_free */
15793
    .tp_vectorcall = unicode_vectorcall,
15794
};
15795
15796
/* Initialize the Unicode implementation */
15797
15798
static void
15799
_init_global_state(void)
15800
16
{
15801
16
    static int initialized = 0;
15802
16
    if (initialized) {
15803
0
        return;
15804
0
    }
15805
16
    initialized = 1;
15806
15807
    /* initialize the linebreak bloom filter */
15808
16
    const Py_UCS2 linebreak[] = {
15809
16
        0x000A, /* LINE FEED */
15810
16
        0x000D, /* CARRIAGE RETURN */
15811
16
        0x001C, /* FILE SEPARATOR */
15812
16
        0x001D, /* GROUP SEPARATOR */
15813
16
        0x001E, /* RECORD SEPARATOR */
15814
16
        0x0085, /* NEXT LINE */
15815
16
        0x2028, /* LINE SEPARATOR */
15816
16
        0x2029, /* PARAGRAPH SEPARATOR */
15817
16
    };
15818
16
    bloom_linebreak = make_bloom_mask(
15819
16
        PyUnicode_2BYTE_KIND, linebreak,
15820
16
        Py_ARRAY_LENGTH(linebreak));
15821
16
}
15822
15823
void
15824
_PyUnicode_InitState(PyInterpreterState *interp)
15825
16
{
15826
16
    if (!_Py_IsMainInterpreter(interp)) {
15827
0
        return;
15828
0
    }
15829
16
    _init_global_state();
15830
16
}
15831
15832
15833
PyStatus
15834
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
15835
16
{
15836
16
    if (_Py_IsMainInterpreter(interp)) {
15837
16
        PyStatus status = init_global_interned_strings(interp);
15838
16
        if (_PyStatus_EXCEPTION(status)) {
15839
0
            return status;
15840
0
        }
15841
16
    }
15842
16
    assert(INTERNED_STRINGS);
15843
15844
16
    if (init_interned_dict(interp)) {
15845
0
        PyErr_Clear();
15846
0
        return _PyStatus_ERR("failed to create interned dict");
15847
0
    }
15848
15849
16
    return _PyStatus_OK();
15850
16
}
15851
15852
15853
PyStatus
15854
_PyUnicode_InitTypes(PyInterpreterState *interp)
15855
16
{
15856
16
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
15857
0
        goto error;
15858
0
    }
15859
16
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
15860
0
        goto error;
15861
0
    }
15862
16
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
15863
0
        goto error;
15864
0
    }
15865
16
    return _PyStatus_OK();
15866
15867
0
error:
15868
0
    return _PyStatus_ERR("Can't initialize unicode types");
15869
16
}
15870
15871
static /* non-null */ PyObject*
15872
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
15873
16.9k
{
15874
    // Note that this steals a reference to `s`, but in many cases that
15875
    // stolen ref is returned, requiring no decref/incref.
15876
15877
16.9k
    assert(s != NULL);
15878
16.9k
    assert(_PyUnicode_CHECK(s));
15879
16.9k
    assert(_PyUnicode_STATE(s).statically_allocated);
15880
16.9k
    assert(!PyUnicode_CHECK_INTERNED(s));
15881
15882
#ifdef Py_DEBUG
15883
    /* We must not add process-global interned string if there's already a
15884
     * per-interpreter interned_dict, which might contain duplicates.
15885
     */
15886
    PyObject *interned = get_interned_dict(interp);
15887
    assert(interned == NULL);
15888
#endif
15889
15890
    /* Look in the global cache first. */
15891
16.9k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
15892
    /* We should only init each string once */
15893
16.9k
    assert(r == NULL);
15894
    /* but just in case (for the non-debug build), handle this */
15895
16.9k
    if (r != NULL && r != s) {
15896
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
15897
0
        assert(_PyUnicode_CHECK(r));
15898
0
        Py_DECREF(s);
15899
0
        return Py_NewRef(r);
15900
0
    }
15901
15902
16.9k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
15903
0
        Py_FatalError("failed to intern static string");
15904
0
    }
15905
15906
16.9k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
15907
16.9k
    return s;
15908
16.9k
}
15909
15910
void
15911
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
15912
16.9k
{
15913
    // This should only be called as part of runtime initialization
15914
16.9k
    assert(!Py_IsInitialized());
15915
15916
16.9k
    *p = intern_static(interp, *p);
15917
16.9k
    assert(*p);
15918
16.9k
}
15919
15920
static void
15921
immortalize_interned(PyObject *s)
15922
94.8k
{
15923
94.8k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
15924
94.8k
    assert(!_Py_IsImmortal(s));
15925
#ifdef Py_REF_DEBUG
15926
    /* The reference count value should be excluded from the RefTotal.
15927
       The decrements to these objects will not be registered so they
15928
       need to be accounted for in here. */
15929
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
15930
        _Py_DecRefTotal(_PyThreadState_GET());
15931
    }
15932
#endif
15933
94.8k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
15934
94.8k
    _Py_SetImmortal(s);
15935
94.8k
}
15936
15937
static /* non-null */ PyObject*
15938
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
15939
              bool immortalize)
15940
34.4M
{
15941
    // Note that this steals a reference to `s`, but in many cases that
15942
    // stolen ref is returned, requiring no decref/incref.
15943
15944
#ifdef Py_DEBUG
15945
    assert(s != NULL);
15946
    assert(_PyUnicode_CHECK(s));
15947
#else
15948
34.4M
    if (s == NULL || !PyUnicode_Check(s)) {
15949
0
        return s;
15950
0
    }
15951
34.4M
#endif
15952
15953
    /* If it's a subclass, we don't really know what putting
15954
       it in the interned dict might do. */
15955
34.4M
    if (!PyUnicode_CheckExact(s)) {
15956
0
        return s;
15957
0
    }
15958
15959
    /* Is it already interned? */
15960
34.4M
    switch (PyUnicode_CHECK_INTERNED(s)) {
15961
3.16M
        case SSTATE_NOT_INTERNED:
15962
            // no, go on
15963
3.16M
            break;
15964
13.8k
        case SSTATE_INTERNED_MORTAL:
15965
            // yes but we might need to make it immortal
15966
13.8k
            if (immortalize) {
15967
34
                immortalize_interned(s);
15968
34
            }
15969
13.8k
            return s;
15970
31.2M
        default:
15971
            // all done
15972
31.2M
            return s;
15973
34.4M
    }
15974
15975
    /* Statically allocated strings must be already interned. */
15976
3.16M
    assert(!_PyUnicode_STATE(s).statically_allocated);
15977
15978
#if Py_GIL_DISABLED
15979
    /* In the free-threaded build, all interned strings are immortal */
15980
    immortalize = 1;
15981
#endif
15982
15983
    /* If it's already immortal, intern it as such */
15984
3.16M
    if (_Py_IsImmortal(s)) {
15985
0
        immortalize = 1;
15986
0
    }
15987
15988
    /* if it's a short string, get the singleton */
15989
3.16M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
15990
3.16M
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
15991
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
15992
0
        assert(PyUnicode_CHECK_INTERNED(r));
15993
0
        Py_DECREF(s);
15994
0
        return r;
15995
0
    }
15996
#ifdef Py_DEBUG
15997
    assert(!unicode_is_singleton(s));
15998
#endif
15999
16000
    /* Look in the global cache now. */
16001
3.16M
    {
16002
3.16M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
16003
3.16M
        if (r != NULL) {
16004
278k
            assert(_PyUnicode_STATE(r).statically_allocated);
16005
278k
            assert(r != s);  // r must be statically_allocated; s is not
16006
278k
            Py_DECREF(s);
16007
278k
            return Py_NewRef(r);
16008
278k
        }
16009
3.16M
    }
16010
16011
    /* Do a setdefault on the per-interpreter cache. */
16012
2.88M
    PyObject *interned = get_interned_dict(interp);
16013
2.88M
    assert(interned != NULL);
16014
#ifdef Py_GIL_DISABLED
16015
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
16016
#endif
16017
2.88M
    FT_MUTEX_LOCK(INTERN_MUTEX);
16018
2.88M
    PyObject *t;
16019
2.88M
    {
16020
2.88M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
16021
2.88M
        if (res < 0) {
16022
0
            PyErr_Clear();
16023
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
16024
0
            return s;
16025
0
        }
16026
2.88M
        else if (res == 1) {
16027
            // value was already present (not inserted)
16028
2.35M
            Py_DECREF(s);
16029
2.35M
            if (immortalize &&
16030
2.35M
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
16031
4.19k
                immortalize_interned(t);
16032
4.19k
            }
16033
2.35M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
16034
2.35M
            return t;
16035
2.35M
        }
16036
531k
        else {
16037
            // value was newly inserted
16038
531k
            assert (s == t);
16039
531k
            Py_DECREF(t);
16040
531k
        }
16041
2.88M
    }
16042
16043
    /* NOT_INTERNED -> INTERNED_MORTAL */
16044
16045
531k
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
16046
16047
531k
    if (!_Py_IsImmortal(s)) {
16048
        /* The two references in interned dict (key and value) are not counted.
16049
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
16050
531k
        Py_DECREF(s);
16051
531k
        Py_DECREF(s);
16052
531k
    }
16053
531k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
16054
16055
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
16056
16057
#ifdef Py_DEBUG
16058
    if (_Py_IsImmortal(s)) {
16059
        assert(immortalize);
16060
    }
16061
#endif
16062
531k
    if (immortalize) {
16063
90.5k
        immortalize_interned(s);
16064
90.5k
    }
16065
16066
531k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
16067
531k
    return s;
16068
2.88M
}
16069
16070
void
16071
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
16072
2.77M
{
16073
2.77M
    *p = intern_common(interp, *p, 1);
16074
2.77M
    assert(*p);
16075
2.77M
}
16076
16077
void
16078
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
16079
31.6M
{
16080
31.6M
    *p = intern_common(interp, *p, 0);
16081
31.6M
    assert(*p);
16082
31.6M
}
16083
16084
16085
void
16086
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
16087
0
{
16088
0
    _PyUnicode_InternImmortal(interp, p);
16089
0
    return;
16090
0
}
16091
16092
void
16093
PyUnicode_InternInPlace(PyObject **p)
16094
0
{
16095
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
16096
0
    _PyUnicode_InternMortal(interp, p);
16097
0
}
16098
16099
// Public-looking name kept for the stable ABI; user should not call this:
16100
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
16101
void
16102
PyUnicode_InternImmortal(PyObject **p)
16103
0
{
16104
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
16105
0
    _PyUnicode_InternImmortal(interp, p);
16106
0
}
16107
16108
PyObject *
16109
PyUnicode_InternFromString(const char *cp)
16110
1.15M
{
16111
1.15M
    PyObject *s = PyUnicode_FromString(cp);
16112
1.15M
    if (s == NULL) {
16113
0
        return NULL;
16114
0
    }
16115
1.15M
    PyInterpreterState *interp = _PyInterpreterState_GET();
16116
1.15M
    _PyUnicode_InternMortal(interp, &s);
16117
1.15M
    return s;
16118
1.15M
}
16119
16120
16121
void
16122
_PyUnicode_ClearInterned(PyInterpreterState *interp)
16123
0
{
16124
0
    PyObject *interned = get_interned_dict(interp);
16125
0
    if (interned == NULL) {
16126
0
        return;
16127
0
    }
16128
0
    assert(PyDict_CheckExact(interned));
16129
16130
0
    if (has_shared_intern_dict(interp)) {
16131
        // the dict doesn't belong to this interpreter, skip the debug
16132
        // checks on it and just clear the pointer to it
16133
0
        clear_interned_dict(interp);
16134
0
        return;
16135
0
    }
16136
16137
#ifdef INTERNED_STATS
16138
    fprintf(stderr, "releasing %zd interned strings\n",
16139
            PyDict_GET_SIZE(interned));
16140
16141
    Py_ssize_t total_length = 0;
16142
#endif
16143
0
    Py_ssize_t pos = 0;
16144
0
    PyObject *s, *ignored_value;
16145
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
16146
0
        int shared = 0;
16147
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
16148
0
        case SSTATE_INTERNED_IMMORTAL:
16149
            /* Make immortal interned strings mortal again. */
16150
            // Skip the Immortal Instance check and restore
16151
            // the two references (key and value) ignored
16152
            // by PyUnicode_InternInPlace().
16153
0
            _Py_SetMortal(s, 2);
16154
#ifdef Py_REF_DEBUG
16155
            /* let's be pedantic with the ref total */
16156
            _Py_IncRefTotal(_PyThreadState_GET());
16157
            _Py_IncRefTotal(_PyThreadState_GET());
16158
#endif
16159
#ifdef INTERNED_STATS
16160
            total_length += PyUnicode_GET_LENGTH(s);
16161
#endif
16162
0
            break;
16163
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
16164
            /* It is shared between interpreters, so we should unmark it
16165
               only when this is the last interpreter in which it's
16166
               interned.  We immortalize all the statically initialized
16167
               strings during startup, so we can rely on the
16168
               main interpreter to be the last one. */
16169
0
            if (!_Py_IsMainInterpreter(interp)) {
16170
0
                shared = 1;
16171
0
            }
16172
0
            break;
16173
0
        case SSTATE_INTERNED_MORTAL:
16174
            // Restore 2 references held by the interned dict; these will
16175
            // be decref'd by clear_interned_dict's PyDict_Clear.
16176
0
            _Py_RefcntAdd(s, 2);
16177
#ifdef Py_REF_DEBUG
16178
            /* let's be pedantic with the ref total */
16179
            _Py_IncRefTotal(_PyThreadState_GET());
16180
            _Py_IncRefTotal(_PyThreadState_GET());
16181
#endif
16182
0
            break;
16183
0
        case SSTATE_NOT_INTERNED:
16184
0
            _Py_FALLTHROUGH;
16185
0
        default:
16186
0
            Py_UNREACHABLE();
16187
0
        }
16188
0
        if (!shared) {
16189
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
16190
0
        }
16191
0
    }
16192
#ifdef INTERNED_STATS
16193
    fprintf(stderr,
16194
            "total length of all interned strings: %zd characters\n",
16195
            total_length);
16196
#endif
16197
16198
0
    struct _Py_unicode_state *state = &interp->unicode;
16199
0
    struct _Py_unicode_ids *ids = &state->ids;
16200
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
16201
0
        Py_XINCREF(ids->array[i]);
16202
0
    }
16203
0
    clear_interned_dict(interp);
16204
0
    if (_Py_IsMainInterpreter(interp)) {
16205
0
        clear_global_interned_strings();
16206
0
    }
16207
0
}
16208
16209
16210
/********************* Unicode Iterator **************************/
16211
16212
typedef struct {
16213
    PyObject_HEAD
16214
    Py_ssize_t it_index;
16215
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
16216
} unicodeiterobject;
16217
16218
static void
16219
unicodeiter_dealloc(PyObject *op)
16220
1.77M
{
16221
1.77M
    unicodeiterobject *it = (unicodeiterobject *)op;
16222
1.77M
    _PyObject_GC_UNTRACK(it);
16223
1.77M
    Py_XDECREF(it->it_seq);
16224
1.77M
    PyObject_GC_Del(it);
16225
1.77M
}
16226
16227
static int
16228
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
16229
14
{
16230
14
    unicodeiterobject *it = (unicodeiterobject *)op;
16231
14
    Py_VISIT(it->it_seq);
16232
14
    return 0;
16233
14
}
16234
16235
/* tp_iternext for the generic str iterator.

   Returns the next character as a str, or NULL (without setting an
   exception here) once the string is exhausted.  On exhaustion the
   iterator releases its reference to the string. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    assert(it != NULL);
    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        /* Already exhausted. */
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Clear the field before releasing the reference. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    int kind = PyUnicode_KIND(seq);
    const void *data = PyUnicode_DATA(seq);
    Py_UCS4 ch = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(ch);
}
16259
16260
/* tp_iternext for iterators over compact ASCII strings.

   Characters are read from the memory directly after the ASCII object
   header and returned as the preallocated single-character strings.
   Returns NULL once the string is exhausted, at which point the iterator
   releases its reference to the string. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    assert(it != NULL);
    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 ch = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                         data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[ch];
}
16282
16283
/* __length_hint__: number of characters left to iterate, or 0 when the
   iterator is exhausted.  Returns a new int object. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    Py_ssize_t remaining = 0;

    if (it->it_seq != NULL) {
        remaining = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
    }
    return PyLong_FromSsize_t(remaining);
}
16292
16293
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
16294
16295
/* __reduce__() implementation for the str iterators.
 *
 * Live iterator:       returns (iter, (string,), index).
 * Exhausted iterator:  returns (iter, (empty_string,)).
 */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    /* _PyEval_GetBuiltin can invoke arbitrary code, so this call must
     * happen before any iterator field is read.
     * see issue #101765 */
    PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (it->it_seq == NULL) {
        PyObject *u = unicode_get_empty();
        if (u == NULL) {
            Py_XDECREF(iter);
            return NULL;
        }
        return Py_BuildValue("N(N)", iter, u);
    }

    return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
}
16316
16317
/* Docstring attached to the "__reduce__" entry of unicodeiter_methods. */
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
16318
16319
/* __setstate__() implementation: restore the iteration index.
 *
 * `state` is converted with PyLong_AsSsize_t(); a conversion error is
 * propagated.  The index is clamped to [0, len(string)].  If the iterator
 * no longer holds a string, the value is ignored.  Returns None.
 */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    PyObject *seq = it->it_seq;
    if (seq != NULL) {
        Py_ssize_t length = PyUnicode_GET_LENGTH(seq);
        if (index < 0) {
            index = 0;
        }
        else if (index > length) {
            index = length; /* iterator truncated */
        }
        it->it_index = index;
    }
    Py_RETURN_NONE;
}
16335
16336
/* Docstring attached to the "__setstate__" entry of unicodeiter_methods. */
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16337
16338
static PyMethodDef unicodeiter_methods[] = {
16339
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
16340
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
16341
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
16342
    {NULL,      NULL}       /* sentinel */
16343
};
16344
16345
PyTypeObject PyUnicodeIter_Type = {
16346
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
16347
    "str_iterator",         /* tp_name */
16348
    sizeof(unicodeiterobject),      /* tp_basicsize */
16349
    0,                  /* tp_itemsize */
16350
    /* methods */
16351
    unicodeiter_dealloc,/* tp_dealloc */
16352
    0,                  /* tp_vectorcall_offset */
16353
    0,                  /* tp_getattr */
16354
    0,                  /* tp_setattr */
16355
    0,                  /* tp_as_async */
16356
    0,                  /* tp_repr */
16357
    0,                  /* tp_as_number */
16358
    0,                  /* tp_as_sequence */
16359
    0,                  /* tp_as_mapping */
16360
    0,                  /* tp_hash */
16361
    0,                  /* tp_call */
16362
    0,                  /* tp_str */
16363
    PyObject_GenericGetAttr,        /* tp_getattro */
16364
    0,                  /* tp_setattro */
16365
    0,                  /* tp_as_buffer */
16366
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
16367
    0,                  /* tp_doc */
16368
    unicodeiter_traverse, /* tp_traverse */
16369
    0,                  /* tp_clear */
16370
    0,                  /* tp_richcompare */
16371
    0,                  /* tp_weaklistoffset */
16372
    PyObject_SelfIter,          /* tp_iter */
16373
    unicodeiter_next,   /* tp_iternext */
16374
    unicodeiter_methods,            /* tp_methods */
16375
    0,
16376
};
16377
16378
PyTypeObject _PyUnicodeASCIIIter_Type = {
16379
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
16380
    .tp_name = "str_ascii_iterator",
16381
    .tp_basicsize = sizeof(unicodeiterobject),
16382
    .tp_dealloc = unicodeiter_dealloc,
16383
    .tp_getattro = PyObject_GenericGetAttr,
16384
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
16385
    .tp_traverse = unicodeiter_traverse,
16386
    .tp_iter = PyObject_SelfIter,
16387
    .tp_iternext = unicode_ascii_iter_next,
16388
    .tp_methods = unicodeiter_methods,
16389
};
16390
16391
/* Create an iterator over the str object `seq`.
 *
 * Compact ASCII strings get a str_ascii_iterator; every other string gets a
 * str_iterator.  The iterator holds a new strong reference to `seq`.
 * Returns NULL with an exception set if `seq` is not a str, or NULL if the
 * allocation fails.
 */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
                              ? &_PyUnicodeASCIIIter_Type
                              : &PyUnicodeIter_Type;

    unicodeiterobject *it = PyObject_GC_New(unicodeiterobject, iter_type);
    if (it == NULL) {
        return NULL;
    }

    it->it_index = 0;
    it->it_seq = Py_NewRef(seq);
    _PyObject_GC_TRACK(it);
    return (PyObject *)it;
}
16413
16414
static int
16415
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
16416
64
{
16417
64
    int res;
16418
64
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16419
64
    if (res == -2) {
16420
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
16421
0
        return -1;
16422
0
    }
16423
64
    if (res < 0) {
16424
0
        PyErr_NoMemory();
16425
0
        return -1;
16426
0
    }
16427
64
    return 0;
16428
64
}
16429
16430
16431
static int
16432
config_get_codec_name(wchar_t **config_encoding)
16433
32
{
16434
32
    char *encoding;
16435
32
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16436
0
        return -1;
16437
0
    }
16438
16439
32
    PyObject *name_obj = NULL;
16440
32
    PyObject *codec = _PyCodec_Lookup(encoding);
16441
32
    PyMem_RawFree(encoding);
16442
16443
32
    if (!codec)
16444
0
        goto error;
16445
16446
32
    name_obj = PyObject_GetAttrString(codec, "name");
16447
32
    Py_CLEAR(codec);
16448
32
    if (!name_obj) {
16449
0
        goto error;
16450
0
    }
16451
16452
32
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16453
32
    Py_DECREF(name_obj);
16454
32
    if (wname == NULL) {
16455
0
        goto error;
16456
0
    }
16457
16458
32
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16459
32
    if (raw_wname == NULL) {
16460
0
        PyMem_Free(wname);
16461
0
        PyErr_NoMemory();
16462
0
        goto error;
16463
0
    }
16464
16465
32
    PyMem_RawFree(*config_encoding);
16466
32
    *config_encoding = raw_wname;
16467
16468
32
    PyMem_Free(wname);
16469
32
    return 0;
16470
16471
0
error:
16472
0
    Py_XDECREF(codec);
16473
0
    Py_XDECREF(name_obj);
16474
0
    return -1;
16475
32
}
16476
16477
16478
static PyStatus
16479
init_stdio_encoding(PyInterpreterState *interp)
16480
16
{
16481
    /* Update the stdio encoding to the normalized Python codec name. */
16482
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16483
16
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
16484
0
        return _PyStatus_ERR("failed to get the Python codec name "
16485
0
                             "of the stdio encoding");
16486
0
    }
16487
16
    return _PyStatus_OK();
16488
16
}
16489
16490
16491
static int
16492
init_fs_codec(PyInterpreterState *interp)
16493
16
{
16494
16
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
16495
16496
16
    _Py_error_handler error_handler;
16497
16
    error_handler = get_error_handler_wide(config->filesystem_errors);
16498
16
    if (error_handler == _Py_ERROR_UNKNOWN) {
16499
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
16500
0
        return -1;
16501
0
    }
16502
16503
16
    char *encoding, *errors;
16504
16
    if (encode_wstr_utf8(config->filesystem_encoding,
16505
16
                         &encoding,
16506
16
                         "filesystem_encoding") < 0) {
16507
0
        return -1;
16508
0
    }
16509
16510
16
    if (encode_wstr_utf8(config->filesystem_errors,
16511
16
                         &errors,
16512
16
                         "filesystem_errors") < 0) {
16513
0
        PyMem_RawFree(encoding);
16514
0
        return -1;
16515
0
    }
16516
16517
16
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16518
16
    PyMem_RawFree(fs_codec->encoding);
16519
16
    fs_codec->encoding = encoding;
16520
    /* encoding has been normalized by init_fs_encoding() */
16521
16
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16522
16
    PyMem_RawFree(fs_codec->errors);
16523
16
    fs_codec->errors = errors;
16524
16
    fs_codec->error_handler = error_handler;
16525
16526
#ifdef _Py_FORCE_UTF8_FS_ENCODING
16527
    assert(fs_codec->utf8 == 1);
16528
#endif
16529
16530
    /* At this point, PyUnicode_EncodeFSDefault() and
16531
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16532
       the C implementation of the filesystem encoding. */
16533
16534
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16535
       global configuration variables. */
16536
16
    if (_Py_IsMainInterpreter(interp)) {
16537
16538
16
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16539
16
                                      fs_codec->errors) < 0) {
16540
0
            PyErr_NoMemory();
16541
0
            return -1;
16542
0
        }
16543
16
    }
16544
16
    return 0;
16545
16
}
16546
16547
16548
static PyStatus
16549
init_fs_encoding(PyThreadState *tstate)
16550
16
{
16551
16
    PyInterpreterState *interp = tstate->interp;
16552
16553
    /* Update the filesystem encoding to the normalized Python codec name.
16554
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16555
       (Python codec name). */
16556
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16557
16
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16558
0
        _Py_DumpPathConfig(tstate);
16559
0
        return _PyStatus_ERR("failed to get the Python codec "
16560
0
                             "of the filesystem encoding");
16561
0
    }
16562
16563
16
    if (init_fs_codec(interp) < 0) {
16564
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
16565
0
    }
16566
16
    return _PyStatus_OK();
16567
16
}
16568
16569
16570
PyStatus
16571
_PyUnicode_InitEncodings(PyThreadState *tstate)
16572
16
{
16573
16
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
16574
16
    if (_PyStatus_EXCEPTION(status)) {
16575
0
        return status;
16576
0
    }
16577
16
    status = init_fs_encoding(tstate);
16578
16
    if (_PyStatus_EXCEPTION(status)) {
16579
0
        return status;
16580
0
    }
16581
16582
16
    return init_stdio_encoding(tstate->interp);
16583
16
}
16584
16585
16586
static void
16587
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16588
0
{
16589
0
    PyMem_RawFree(fs_codec->encoding);
16590
0
    fs_codec->encoding = NULL;
16591
0
    fs_codec->utf8 = 0;
16592
0
    PyMem_RawFree(fs_codec->errors);
16593
0
    fs_codec->errors = NULL;
16594
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16595
0
}
16596
16597
16598
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to mbcs/replace
 * (PEP 529) and re-initialize the filesystem codec.
 * Returns the result of init_fs_codec(), or -1 with MemoryError set if the
 * new configuration strings cannot be allocated (the configuration is left
 * unchanged in that case).
 */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacements first so the config is only modified when
       both are available. */
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");

    if (encoding != NULL && errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = errors;

        return init_fs_codec(interp);
    }

    PyMem_RawFree(encoding);
    PyMem_RawFree(errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16623
16624
16625
#ifdef Py_DEBUG
/* Debug-build helper: returns 1 when the main interpreter no longer has an
   interned-strings dict, 0 otherwise. */
static inline int
unicode_is_finalizing(void)
{
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
    return (get_interned_dict(main_interp) == NULL);
}
#endif
16632
16633
16634
void
16635
_PyUnicode_FiniTypes(PyInterpreterState *interp)
16636
0
{
16637
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
16638
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
16639
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
16640
0
}
16641
16642
16643
void
16644
_PyUnicode_Fini(PyInterpreterState *interp)
16645
0
{
16646
0
    struct _Py_unicode_state *state = &interp->unicode;
16647
16648
0
    if (!has_shared_intern_dict(interp)) {
16649
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16650
0
        assert(get_interned_dict(interp) == NULL);
16651
0
    }
16652
16653
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
16654
16655
    // bpo-47182: force a unicodedata CAPI capsule re-import on
16656
    // subsequent initialization of interpreter.
16657
0
    interp->unicode.ucnhash_capi = NULL;
16658
16659
0
    unicode_clear_identifiers(state);
16660
0
}
16661
16662
/* A _string module, to export formatter_parser and formatter_field_name_split
16663
   to the string.Formatter class implemented in Python. */
16664
16665
static PyMethodDef _string_methods[] = {
16666
    {"formatter_field_name_split", formatter_field_name_split,
16667
     METH_O, PyDoc_STR("split the argument as a field name")},
16668
    {"formatter_parser", formatter_parser,
16669
     METH_O, PyDoc_STR("parse the argument as a format string")},
16670
    {NULL, NULL}
16671
};
16672
16673
static PyModuleDef_Slot module_slots[] = {
16674
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
16675
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
16676
    {0, NULL}
16677
};
16678
16679
static struct PyModuleDef _string_module = {
16680
    PyModuleDef_HEAD_INIT,
16681
    .m_name = "_string",
16682
    .m_doc = PyDoc_STR("string helper module"),
16683
    .m_size = 0,
16684
    .m_methods = _string_methods,
16685
    .m_slots = module_slots,
16686
};
16687
16688
PyMODINIT_FUNC
16689
PyInit__string(void)
16690
6
{
16691
6
    return PyModuleDef_Init(&_string_module);
16692
6
}
16693
16694
16695
#undef PyUnicode_KIND
16696
int PyUnicode_KIND(PyObject *op)
16697
0
{
16698
0
    if (!PyUnicode_Check(op)) {
16699
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
16700
0
        return -1;
16701
0
    }
16702
0
    return _PyASCIIObject_CAST(op)->state.kind;
16703
0
}
16704
16705
#undef PyUnicode_DATA
16706
void* PyUnicode_DATA(PyObject *op)
16707
0
{
16708
0
    if (!PyUnicode_Check(op)) {
16709
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
16710
0
        return NULL;
16711
0
    }
16712
0
    return _PyUnicode_DATA(op);
16713
0
}