Coverage Report

Created: 2025-07-18 06:09

/src/cpython/Objects/unicodeobject.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_freelist.h"      // _Py_FREELIST_FREE(), _Py_FREELIST_POP()
50
#include "pycore_initconfig.h"    // _PyStatus_OK()
51
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
52
#include "pycore_long.h"          // _PyLong_FormatWriter()
53
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
54
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
55
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
56
#include "pycore_pyhash.h"        // _Py_HashSecret_t
57
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
58
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
59
#include "pycore_template.h"      // _PyTemplate_Concat()
60
#include "pycore_tuple.h"         // _PyTuple_FromArray()
61
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
62
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
63
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
64
65
#include "stringlib/eq.h"         // unicode_eq()
66
#include <stddef.h>               // ptrdiff_t
67
68
#ifdef MS_WINDOWS
69
#include <windows.h>
70
#endif
71
72
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
73
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
74
#endif
75
76
/* Uncomment to display statistics on interned strings at exit
77
   in _PyUnicode_ClearInterned(). */
78
/* #define INTERNED_STATS 1 */
79
80
81
/*[clinic input]
82
class str "PyObject *" "&PyUnicode_Type"
83
[clinic start generated code]*/
84
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
85
86
/*[python input]
87
class Py_UCS4_converter(CConverter):
88
    type = 'Py_UCS4'
89
    converter = 'convert_uc'
90
91
    def converter_init(self):
92
        if self.default is not unspecified:
93
            self.c_default = ascii(self.default)
94
            if len(self.c_default) > 4 or self.c_default[0] != "'":
95
                self.c_default = hex(ord(self.default))
96
97
[python start generated code]*/
98
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
99
100
/* --- Globals ------------------------------------------------------------
101
102
NOTE: In the interpreter's initialization phase, some globals are currently
103
      initialized dynamically as needed. In the process Unicode objects may
104
      be created before the Unicode type is ready.
105
106
*/
107
108
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
109
// The value must be the same in fileutils.c.
110
86.8M
#define MAX_UNICODE 0x10ffff
111
112
#ifdef Py_DEBUG
113
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
114
#else
115
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
116
#endif
117
118
#ifdef Py_GIL_DISABLED
119
#  define LOCK_INTERNED(interp) PyMutex_Lock(&_Py_INTERP_CACHED_OBJECT(interp, interned_mutex))
120
#  define UNLOCK_INTERNED(interp) PyMutex_Unlock(&_Py_INTERP_CACHED_OBJECT(interp, interned_mutex))
121
#else
122
#  define LOCK_INTERNED(interp)
123
#  define UNLOCK_INTERNED(interp)
124
#endif
125
126
static inline char* _PyUnicode_UTF8(PyObject *op)
127
258M
{
128
258M
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(_PyCompactUnicodeObject_CAST(op)->utf8);
129
258M
}
130
131
static inline char* PyUnicode_UTF8(PyObject *op)
132
61.4M
{
133
61.4M
    assert(_PyUnicode_CHECK(op));
134
61.4M
    if (PyUnicode_IS_COMPACT_ASCII(op)) {
135
48.3M
        return ((char*)(_PyASCIIObject_CAST(op) + 1));
136
48.3M
    }
137
13.0M
    else {
138
13.0M
         return _PyUnicode_UTF8(op);
139
13.0M
    }
140
61.4M
}
141
142
static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8)
143
19.8M
{
144
19.8M
    FT_ATOMIC_STORE_PTR_RELEASE(_PyCompactUnicodeObject_CAST(op)->utf8, utf8);
145
19.8M
}
146
147
static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op)
148
27.3M
{
149
27.3M
    assert(_PyUnicode_CHECK(op));
150
27.3M
    if (PyUnicode_IS_COMPACT_ASCII(op)) {
151
24.1M
         return _PyASCIIObject_CAST(op)->length;
152
24.1M
    }
153
3.23M
    else {
154
3.23M
         return _PyCompactUnicodeObject_CAST(op)->utf8_length;
155
3.23M
    }
156
27.3M
}
157
158
static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
159
19.8M
{
160
19.8M
    _PyCompactUnicodeObject_CAST(op)->utf8_length = length;
161
19.8M
}
162
163
#define _PyUnicode_LENGTH(op)                           \
164
587M
    (_PyASCIIObject_CAST(op)->length)
165
#define _PyUnicode_STATE(op)                            \
166
3.67G
    (_PyASCIIObject_CAST(op)->state)
167
#define _PyUnicode_HASH(op)                             \
168
534M
    (_PyASCIIObject_CAST(op)->hash)
169
170
106M
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
171
172
static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
173
42.0M
{
174
42.0M
    FT_ATOMIC_STORE_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash, hash);
175
42.0M
}
176
177
#define _PyUnicode_DATA_ANY(op)                         \
178
41.9M
    (_PyUnicodeObject_CAST(op)->data.any)
179
180
static inline int _PyUnicode_SHARE_UTF8(PyObject *op)
181
0
{
182
0
    assert(_PyUnicode_CHECK(op));
183
0
    assert(!PyUnicode_IS_COMPACT_ASCII(op));
184
0
    return (_PyUnicode_UTF8(op) == PyUnicode_DATA(op));
185
0
}
186
187
/* true if the Unicode object has an allocated UTF-8 memory block
188
   (not shared with other data) */
189
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
190
586M
{
191
586M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
192
586M
            && _PyUnicode_UTF8(op) != NULL
193
586M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
194
586M
}
195
196
197
/* Generic helper macro to convert characters of different types.
198
   from_type and to_type have to be valid type names, begin and end
199
   are pointers to the source characters which should be of type
200
   "from_type *".  to is a pointer of type "to_type *" and points to the
201
   buffer where the result characters are written to. */
202
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
203
174M
    do {                                                \
204
174M
        to_type *_to = (to_type *)(to);                 \
205
174M
        const from_type *_iter = (const from_type *)(begin);\
206
174M
        const from_type *_end = (const from_type *)(end);\
207
174M
        Py_ssize_t n = (_end) - (_iter);                \
208
174M
        const from_type *_unrolled_end =                \
209
174M
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
210
1.07G
        while (_iter < (_unrolled_end)) {               \
211
901M
            _to[0] = (to_type) _iter[0];                \
212
901M
            _to[1] = (to_type) _iter[1];                \
213
901M
            _to[2] = (to_type) _iter[2];                \
214
901M
            _to[3] = (to_type) _iter[3];                \
215
901M
            _iter += 4; _to += 4;                       \
216
901M
        }                                               \
217
399M
        while (_iter < (_end))                          \
218
225M
            *_to++ = (to_type) *_iter++;                \
219
174M
    } while (0)
220
221
251M
#define LATIN1 _Py_LATIN1_CHR
222
223
#ifdef MS_WINDOWS
224
   /* On Windows, overallocating by 50% is the best factor */
225
#  define OVERALLOCATE_FACTOR 2
226
#else
227
   /* On Linux, overallocating by 25% is the best factor */
228
109M
#  define OVERALLOCATE_FACTOR 4
229
#endif
230
231
/* Forward declaration */
232
static inline int
233
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
234
static inline void
235
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
236
static PyObject *
237
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
238
                    const char *errors);
239
static PyObject *
240
unicode_decode_utf8(const char *s, Py_ssize_t size,
241
                    _Py_error_handler error_handler, const char *errors,
242
                    Py_ssize_t *consumed);
243
static int
244
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
245
                           const char *s, Py_ssize_t size,
246
                           _Py_error_handler error_handler, const char *errors,
247
                           Py_ssize_t *consumed);
248
#ifdef Py_DEBUG
249
static inline int unicode_is_finalizing(void);
250
static int unicode_is_singleton(PyObject *unicode);
251
#endif
252
253
254
// Return a reference to the immortal empty string singleton.
255
static inline PyObject* unicode_get_empty(void)
256
114M
{
257
114M
    _Py_DECLARE_STR(empty, "");
258
114M
    return &_Py_STR(empty);
259
114M
}
260
261
/* This dictionary holds per-interpreter interned strings.
262
 * See InternalDocs/string_interning.md for details.
263
 */
264
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
265
3.61M
{
266
3.61M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
267
3.61M
}
268
269
/* This hashtable holds statically allocated interned strings.
270
 * See InternalDocs/string_interning.md for details.
271
 */
272
3.29M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
273
274
/* Get number of all interned strings for the current interpreter. */
275
Py_ssize_t
276
_PyUnicode_InternedSize(void)
277
0
{
278
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
279
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
280
0
}
281
282
/* Get number of immortal interned strings for the current interpreter. */
283
Py_ssize_t
284
_PyUnicode_InternedSize_Immortal(void)
285
0
{
286
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
287
0
    PyObject *key, *value;
288
0
    Py_ssize_t pos = 0;
289
0
    Py_ssize_t count = 0;
290
291
    // It's tempting to keep a count and avoid a loop here. But, this function
292
    // is intended for refleak tests. It spends extra work to report the true
293
    // value, to help detect bugs in optimizations.
294
295
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
296
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
297
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
298
0
           count++;
299
0
       }
300
0
    }
301
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
302
0
}
303
304
static Py_hash_t unicode_hash(PyObject *);
305
306
static Py_uhash_t
307
hashtable_unicode_hash(const void *key)
308
3.29M
{
309
3.29M
    return unicode_hash((PyObject *)key);
310
3.29M
}
311
312
static int
313
hashtable_unicode_compare(const void *key1, const void *key2)
314
241k
{
315
241k
    PyObject *obj1 = (PyObject *)key1;
316
241k
    PyObject *obj2 = (PyObject *)key2;
317
241k
    if (obj1 != NULL && obj2 != NULL) {
318
241k
        return unicode_eq(obj1, obj2);
319
241k
    }
320
0
    else {
321
0
        return obj1 == obj2;
322
0
    }
323
241k
}
324
325
/* Return true if this interpreter should share the main interpreter's
326
   intern_dict.  That's important for interpreters which load basic
327
   single-phase init extension modules (m_size == -1).  There could be interned
328
   immortal strings that are shared between interpreters, due to the
329
   PyDict_Update(mdict, m_copy) call in import_find_extension().
330
331
   It's not safe to deallocate those strings until all interpreters that
332
   potentially use them are freed.  By storing them in the main interpreter, we
333
   ensure they get freed after all other interpreters are freed.
334
*/
335
static bool
336
has_shared_intern_dict(PyInterpreterState *interp)
337
16
{
338
16
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
339
16
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
340
16
}
341
342
static int
343
init_interned_dict(PyInterpreterState *interp)
344
16
{
345
16
    assert(get_interned_dict(interp) == NULL);
346
16
    PyObject *interned;
347
16
    if (has_shared_intern_dict(interp)) {
348
0
        interned = get_interned_dict(_PyInterpreterState_Main());
349
0
        Py_INCREF(interned);
350
0
    }
351
16
    else {
352
16
        interned = PyDict_New();
353
16
        if (interned == NULL) {
354
0
            return -1;
355
0
        }
356
16
    }
357
16
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
358
16
    return 0;
359
16
}
360
361
static void
362
clear_interned_dict(PyInterpreterState *interp)
363
0
{
364
0
    PyObject *interned = get_interned_dict(interp);
365
0
    if (interned != NULL) {
366
0
        if (!has_shared_intern_dict(interp)) {
367
            // only clear if the dict belongs to this interpreter
368
0
            PyDict_Clear(interned);
369
0
        }
370
0
        Py_DECREF(interned);
371
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
372
0
    }
373
0
}
374
375
static PyStatus
376
init_global_interned_strings(PyInterpreterState *interp)
377
16
{
378
16
    assert(INTERNED_STRINGS == NULL);
379
16
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
380
381
16
    INTERNED_STRINGS = _Py_hashtable_new_full(
382
16
        hashtable_unicode_hash,
383
16
        hashtable_unicode_compare,
384
        // Objects stored here are immortal and statically allocated,
385
        // so we don't need key_destroy_func & value_destroy_func:
386
16
        NULL,
387
16
        NULL,
388
16
        &hashtable_alloc
389
16
    );
390
16
    if (INTERNED_STRINGS == NULL) {
391
0
        PyErr_Clear();
392
0
        return _PyStatus_ERR("failed to create global interned dict");
393
0
    }
394
395
    /* Intern statically allocated string identifiers, deepfreeze strings,
396
        * and one-byte latin-1 strings.
397
        * This must be done before any module initialization so that statically
398
        * allocated string identifiers are used instead of heap allocated strings.
399
        * Deepfreeze uses the interned identifiers if present to save space
400
        * else generates them and they are interned to speed up dict lookups.
401
    */
402
16
    _PyUnicode_InitStaticStrings(interp);
403
404
4.11k
    for (int i = 0; i < 256; i++) {
405
4.09k
        PyObject *s = LATIN1(i);
406
4.09k
        _PyUnicode_InternStatic(interp, &s);
407
4.09k
        assert(s == LATIN1(i));
408
4.09k
    }
409
#ifdef Py_DEBUG
410
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
411
412
    for (int i = 0; i < 256; i++) {
413
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
414
    }
415
#endif
416
16
    return _PyStatus_OK();
417
16
}
418
419
static void clear_global_interned_strings(void)
420
0
{
421
0
    if (INTERNED_STRINGS != NULL) {
422
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
423
0
        INTERNED_STRINGS = NULL;
424
0
    }
425
0
}
426
427
#define _Py_RETURN_UNICODE_EMPTY()   \
428
48.4M
    do {                             \
429
48.4M
        return unicode_get_empty();  \
430
48.4M
    } while (0)
431
432
static inline void
433
unicode_fill(int kind, void *data, Py_UCS4 value,
434
             Py_ssize_t start, Py_ssize_t length)
435
11.9M
{
436
11.9M
    assert(0 <= start);
437
11.9M
    switch (kind) {
438
3.43M
    case PyUnicode_1BYTE_KIND: {
439
3.43M
        assert(value <= 0xff);
440
3.43M
        Py_UCS1 ch = (unsigned char)value;
441
3.43M
        Py_UCS1 *to = (Py_UCS1 *)data + start;
442
3.43M
        memset(to, ch, length);
443
3.43M
        break;
444
0
    }
445
5.69M
    case PyUnicode_2BYTE_KIND: {
446
5.69M
        assert(value <= 0xffff);
447
5.69M
        Py_UCS2 ch = (Py_UCS2)value;
448
5.69M
        Py_UCS2 *to = (Py_UCS2 *)data + start;
449
5.69M
        const Py_UCS2 *end = to + length;
450
50.7M
        for (; to < end; ++to) *to = ch;
451
5.69M
        break;
452
0
    }
453
2.83M
    case PyUnicode_4BYTE_KIND: {
454
2.83M
        assert(value <= MAX_UNICODE);
455
2.83M
        Py_UCS4 ch = value;
456
2.83M
        Py_UCS4 * to = (Py_UCS4 *)data + start;
457
2.83M
        const Py_UCS4 *end = to + length;
458
24.6M
        for (; to < end; ++to) *to = ch;
459
2.83M
        break;
460
0
    }
461
0
    default: Py_UNREACHABLE();
462
11.9M
    }
463
11.9M
}
464
465
466
/* Fast detection of the most frequent whitespace characters */
467
const unsigned char _Py_ascii_whitespace[] = {
468
    0, 0, 0, 0, 0, 0, 0, 0,
469
/*     case 0x0009: * CHARACTER TABULATION */
470
/*     case 0x000A: * LINE FEED */
471
/*     case 0x000B: * LINE TABULATION */
472
/*     case 0x000C: * FORM FEED */
473
/*     case 0x000D: * CARRIAGE RETURN */
474
    0, 1, 1, 1, 1, 1, 0, 0,
475
    0, 0, 0, 0, 0, 0, 0, 0,
476
/*     case 0x001C: * FILE SEPARATOR */
477
/*     case 0x001D: * GROUP SEPARATOR */
478
/*     case 0x001E: * RECORD SEPARATOR */
479
/*     case 0x001F: * UNIT SEPARATOR */
480
    0, 0, 0, 0, 1, 1, 1, 1,
481
/*     case 0x0020: * SPACE */
482
    1, 0, 0, 0, 0, 0, 0, 0,
483
    0, 0, 0, 0, 0, 0, 0, 0,
484
    0, 0, 0, 0, 0, 0, 0, 0,
485
    0, 0, 0, 0, 0, 0, 0, 0,
486
487
    0, 0, 0, 0, 0, 0, 0, 0,
488
    0, 0, 0, 0, 0, 0, 0, 0,
489
    0, 0, 0, 0, 0, 0, 0, 0,
490
    0, 0, 0, 0, 0, 0, 0, 0,
491
    0, 0, 0, 0, 0, 0, 0, 0,
492
    0, 0, 0, 0, 0, 0, 0, 0,
493
    0, 0, 0, 0, 0, 0, 0, 0,
494
    0, 0, 0, 0, 0, 0, 0, 0
495
};
496
497
/* forward */
498
static PyObject* get_latin1_char(unsigned char ch);
499
static int unicode_modifiable(PyObject *unicode);
500
501
502
static PyObject *
503
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
504
static PyObject *
505
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
506
static PyObject *
507
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
508
509
static PyObject *
510
unicode_encode_call_errorhandler(const char *errors,
511
       PyObject **errorHandler,const char *encoding, const char *reason,
512
       PyObject *unicode, PyObject **exceptionObject,
513
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
514
515
static void
516
raise_encode_exception(PyObject **exceptionObject,
517
                       const char *encoding,
518
                       PyObject *unicode,
519
                       Py_ssize_t startpos, Py_ssize_t endpos,
520
                       const char *reason);
521
522
/* Fast detection of the most frequent line break characters
   (same lookup-table scheme as _Py_ascii_whitespace) */
523
static const unsigned char ascii_linebreak[] = {
524
    0, 0, 0, 0, 0, 0, 0, 0,
525
/*         0x000A, * LINE FEED */
526
/*         0x000B, * LINE TABULATION */
527
/*         0x000C, * FORM FEED */
528
/*         0x000D, * CARRIAGE RETURN */
529
    0, 0, 1, 1, 1, 1, 0, 0,
530
    0, 0, 0, 0, 0, 0, 0, 0,
531
/*         0x001C, * FILE SEPARATOR */
532
/*         0x001D, * GROUP SEPARATOR */
533
/*         0x001E, * RECORD SEPARATOR */
534
    0, 0, 0, 0, 1, 1, 1, 0,
535
    0, 0, 0, 0, 0, 0, 0, 0,
536
    0, 0, 0, 0, 0, 0, 0, 0,
537
    0, 0, 0, 0, 0, 0, 0, 0,
538
    0, 0, 0, 0, 0, 0, 0, 0,
539
540
    0, 0, 0, 0, 0, 0, 0, 0,
541
    0, 0, 0, 0, 0, 0, 0, 0,
542
    0, 0, 0, 0, 0, 0, 0, 0,
543
    0, 0, 0, 0, 0, 0, 0, 0,
544
    0, 0, 0, 0, 0, 0, 0, 0,
545
    0, 0, 0, 0, 0, 0, 0, 0,
546
    0, 0, 0, 0, 0, 0, 0, 0,
547
    0, 0, 0, 0, 0, 0, 0, 0
548
};
549
550
static int convert_uc(PyObject *obj, void *addr);
551
552
struct encoding_map;
553
#include "clinic/unicodeobject.c.h"
554
555
_Py_error_handler
556
_Py_GetErrorHandler(const char *errors)
557
517k
{
558
517k
    if (errors == NULL || strcmp(errors, "strict") == 0) {
559
191k
        return _Py_ERROR_STRICT;
560
191k
    }
561
326k
    if (strcmp(errors, "surrogateescape") == 0) {
562
169k
        return _Py_ERROR_SURROGATEESCAPE;
563
169k
    }
564
157k
    if (strcmp(errors, "replace") == 0) {
565
157k
        return _Py_ERROR_REPLACE;
566
157k
    }
567
0
    if (strcmp(errors, "ignore") == 0) {
568
0
        return _Py_ERROR_IGNORE;
569
0
    }
570
0
    if (strcmp(errors, "backslashreplace") == 0) {
571
0
        return _Py_ERROR_BACKSLASHREPLACE;
572
0
    }
573
0
    if (strcmp(errors, "surrogatepass") == 0) {
574
0
        return _Py_ERROR_SURROGATEPASS;
575
0
    }
576
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
577
0
        return _Py_ERROR_XMLCHARREFREPLACE;
578
0
    }
579
0
    return _Py_ERROR_OTHER;
580
0
}
581
582
583
static _Py_error_handler
584
get_error_handler_wide(const wchar_t *errors)
585
5.55k
{
586
5.55k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
587
0
        return _Py_ERROR_STRICT;
588
0
    }
589
5.55k
    if (wcscmp(errors, L"surrogateescape") == 0) {
590
5.55k
        return _Py_ERROR_SURROGATEESCAPE;
591
5.55k
    }
592
0
    if (wcscmp(errors, L"replace") == 0) {
593
0
        return _Py_ERROR_REPLACE;
594
0
    }
595
0
    if (wcscmp(errors, L"ignore") == 0) {
596
0
        return _Py_ERROR_IGNORE;
597
0
    }
598
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
599
0
        return _Py_ERROR_BACKSLASHREPLACE;
600
0
    }
601
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
602
0
        return _Py_ERROR_SURROGATEPASS;
603
0
    }
604
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
605
0
        return _Py_ERROR_XMLCHARREFREPLACE;
606
0
    }
607
0
    return _Py_ERROR_OTHER;
608
0
}
609
610
611
static inline int
612
unicode_check_encoding_errors(const char *encoding, const char *errors)
613
21.2M
{
614
21.2M
    if (encoding == NULL && errors == NULL) {
615
11.6M
        return 0;
616
11.6M
    }
617
618
9.61M
    PyInterpreterState *interp = _PyInterpreterState_GET();
619
9.61M
#ifndef Py_DEBUG
620
    /* In release mode, only check in development mode (-X dev) */
621
9.61M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
622
9.61M
        return 0;
623
9.61M
    }
624
#else
625
    /* Always check in debug mode */
626
#endif
627
628
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
629
       codec registry is ready: before _PyUnicode_InitEncodings() is called. */
630
0
    if (!interp->unicode.fs_codec.encoding) {
631
0
        return 0;
632
0
    }
633
634
    /* Disable checks during Python finalization. For example, this allows
635
       calling _PyObject_Dump() during finalization for debugging purposes. */
636
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
637
0
        return 0;
638
0
    }
639
640
0
    if (encoding != NULL
641
        // Fast path for the most common built-in encodings. Even if the codec
642
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
643
        // create a temporary Unicode string (the key in the cache).
644
0
        && strcmp(encoding, "utf-8") != 0
645
0
        && strcmp(encoding, "utf8") != 0
646
0
        && strcmp(encoding, "ascii") != 0)
647
0
    {
648
0
        PyObject *handler = _PyCodec_Lookup(encoding);
649
0
        if (handler == NULL) {
650
0
            return -1;
651
0
        }
652
0
        Py_DECREF(handler);
653
0
    }
654
655
0
    if (errors != NULL
656
        // Fast path for the most common built-in error handlers.
657
0
        && strcmp(errors, "strict") != 0
658
0
        && strcmp(errors, "ignore") != 0
659
0
        && strcmp(errors, "replace") != 0
660
0
        && strcmp(errors, "surrogateescape") != 0
661
0
        && strcmp(errors, "surrogatepass") != 0)
662
0
    {
663
0
        PyObject *handler = PyCodec_LookupError(errors);
664
0
        if (handler == NULL) {
665
0
            return -1;
666
0
        }
667
0
        Py_DECREF(handler);
668
0
    }
669
0
    return 0;
670
0
}
671
672
673
int
674
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
675
0
{
676
0
#define CHECK(expr) \
677
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
678
679
0
    assert(op != NULL);
680
0
    CHECK(PyUnicode_Check(op));
681
682
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
683
0
    int kind = ascii->state.kind;
684
685
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
686
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
687
0
    }
688
0
    else {
689
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
690
0
        void *data;
691
692
0
        if (ascii->state.compact == 1) {
693
0
            data = compact + 1;
694
0
            CHECK(kind == PyUnicode_1BYTE_KIND
695
0
                                 || kind == PyUnicode_2BYTE_KIND
696
0
                                 || kind == PyUnicode_4BYTE_KIND);
697
0
            CHECK(ascii->state.ascii == 0);
698
0
            CHECK(_PyUnicode_UTF8(op) != data);
699
0
        }
700
0
        else {
701
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
702
703
0
            data = unicode->data.any;
704
0
            CHECK(kind == PyUnicode_1BYTE_KIND
705
0
                     || kind == PyUnicode_2BYTE_KIND
706
0
                     || kind == PyUnicode_4BYTE_KIND);
707
0
            CHECK(ascii->state.compact == 0);
708
0
            CHECK(data != NULL);
709
0
            if (ascii->state.ascii) {
710
0
                CHECK(_PyUnicode_UTF8(op) == data);
711
0
                CHECK(compact->utf8_length == ascii->length);
712
0
            }
713
0
            else {
714
0
                CHECK(_PyUnicode_UTF8(op) != data);
715
0
            }
716
0
        }
717
0
#ifndef Py_GIL_DISABLED
718
0
        if (_PyUnicode_UTF8(op) == NULL)
719
0
            CHECK(compact->utf8_length == 0);
720
0
#endif
721
0
    }
722
723
    /* check that the best kind is used: O(n) operation */
724
0
    if (check_content) {
725
0
        Py_ssize_t i;
726
0
        Py_UCS4 maxchar = 0;
727
0
        const void *data;
728
0
        Py_UCS4 ch;
729
730
0
        data = PyUnicode_DATA(ascii);
731
0
        for (i=0; i < ascii->length; i++)
732
0
        {
733
0
            ch = PyUnicode_READ(kind, data, i);
734
0
            if (ch > maxchar)
735
0
                maxchar = ch;
736
0
        }
737
0
        if (kind == PyUnicode_1BYTE_KIND) {
738
0
            if (ascii->state.ascii == 0) {
739
0
                CHECK(maxchar >= 128);
740
0
                CHECK(maxchar <= 255);
741
0
            }
742
0
            else
743
0
                CHECK(maxchar < 128);
744
0
        }
745
0
        else if (kind == PyUnicode_2BYTE_KIND) {
746
0
            CHECK(maxchar >= 0x100);
747
0
            CHECK(maxchar <= 0xFFFF);
748
0
        }
749
0
        else {
750
0
            CHECK(maxchar >= 0x10000);
751
0
            CHECK(maxchar <= MAX_UNICODE);
752
0
        }
753
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
754
0
    }
755
756
    /* Check interning state */
757
#ifdef Py_DEBUG
758
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
759
    // extensions can make immortal strings mortal (but with a high enough
760
    // refcount).
761
    // The other way is extremely unlikely (worth a potential failed assertion
762
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
763
    switch (PyUnicode_CHECK_INTERNED(op)) {
764
        case SSTATE_NOT_INTERNED:
765
            if (ascii->state.statically_allocated) {
766
                // This state is for two exceptions:
767
                // - strings are currently checked before they're interned
768
                // - the 256 one-latin1-character strings
769
                //   are static but use SSTATE_NOT_INTERNED
770
            }
771
            else {
772
                CHECK(!_Py_IsImmortal(op));
773
            }
774
            break;
775
        case SSTATE_INTERNED_MORTAL:
776
            CHECK(!ascii->state.statically_allocated);
777
            CHECK(!_Py_IsImmortal(op));
778
            break;
779
        case SSTATE_INTERNED_IMMORTAL:
780
            CHECK(!ascii->state.statically_allocated);
781
            break;
782
        case SSTATE_INTERNED_IMMORTAL_STATIC:
783
            CHECK(ascii->state.statically_allocated);
784
            break;
785
        default:
786
            Py_UNREACHABLE();
787
    }
788
#endif
789
790
0
    return 1;
791
792
0
#undef CHECK
793
0
}
794
795
static PyObject*
796
unicode_result(PyObject *unicode)
797
51.9M
{
798
51.9M
    assert(_PyUnicode_CHECK(unicode));
799
800
51.9M
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
801
51.9M
    if (length == 0) {
802
183
        PyObject *empty = unicode_get_empty();
803
183
        if (unicode != empty) {
804
0
            Py_DECREF(unicode);
805
0
        }
806
183
        return empty;
807
183
    }
808
809
51.9M
    if (length == 1) {
810
301k
        int kind = PyUnicode_KIND(unicode);
811
301k
        if (kind == PyUnicode_1BYTE_KIND) {
812
87.8k
            const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
813
87.8k
            Py_UCS1 ch = data[0];
814
87.8k
            PyObject *latin1_char = LATIN1(ch);
815
87.8k
            if (unicode != latin1_char) {
816
82.5k
                Py_DECREF(unicode);
817
82.5k
            }
818
87.8k
            return latin1_char;
819
87.8k
        }
820
301k
    }
821
822
51.8M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
823
51.8M
    return unicode;
824
51.9M
}
825
826
static PyObject*
827
unicode_result_unchanged(PyObject *unicode)
828
155M
{
829
155M
    if (PyUnicode_CheckExact(unicode)) {
830
151M
        return Py_NewRef(unicode);
831
151M
    }
832
3.22M
    else
833
        /* Subtype -- return genuine unicode string with the same value. */
834
3.22M
        return _PyUnicode_Copy(unicode);
835
155M
}
836
837
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
838
   ASCII, Latin1, UTF-8, etc. */
839
static char*
840
backslashreplace(_PyBytesWriter *writer, char *str,
841
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
842
0
{
843
0
    Py_ssize_t size, i;
844
0
    Py_UCS4 ch;
845
0
    int kind;
846
0
    const void *data;
847
848
0
    kind = PyUnicode_KIND(unicode);
849
0
    data = PyUnicode_DATA(unicode);
850
851
0
    size = 0;
852
    /* determine replacement size */
853
0
    for (i = collstart; i < collend; ++i) {
854
0
        Py_ssize_t incr;
855
856
0
        ch = PyUnicode_READ(kind, data, i);
857
0
        if (ch < 0x100)
858
0
            incr = 2+2;
859
0
        else if (ch < 0x10000)
860
0
            incr = 2+4;
861
0
        else {
862
0
            assert(ch <= MAX_UNICODE);
863
0
            incr = 2+8;
864
0
        }
865
0
        if (size > PY_SSIZE_T_MAX - incr) {
866
0
            PyErr_SetString(PyExc_OverflowError,
867
0
                            "encoded result is too long for a Python string");
868
0
            return NULL;
869
0
        }
870
0
        size += incr;
871
0
    }
872
873
0
    str = _PyBytesWriter_Prepare(writer, str, size);
874
0
    if (str == NULL)
875
0
        return NULL;
876
877
    /* generate replacement */
878
0
    for (i = collstart; i < collend; ++i) {
879
0
        ch = PyUnicode_READ(kind, data, i);
880
0
        *str++ = '\\';
881
0
        if (ch >= 0x00010000) {
882
0
            *str++ = 'U';
883
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
884
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
885
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
886
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
887
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
888
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
889
0
        }
890
0
        else if (ch >= 0x100) {
891
0
            *str++ = 'u';
892
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
893
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
894
0
        }
895
0
        else
896
0
            *str++ = 'x';
897
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
898
0
        *str++ = Py_hexdigits[ch&0xf];
899
0
    }
900
0
    return str;
901
0
}
902
903
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
904
   ASCII, Latin1, UTF-8, etc. */
905
static char*
906
xmlcharrefreplace(_PyBytesWriter *writer, char *str,
907
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
908
0
{
909
0
    Py_ssize_t size, i;
910
0
    Py_UCS4 ch;
911
0
    int kind;
912
0
    const void *data;
913
914
0
    kind = PyUnicode_KIND(unicode);
915
0
    data = PyUnicode_DATA(unicode);
916
917
0
    size = 0;
918
    /* determine replacement size */
919
0
    for (i = collstart; i < collend; ++i) {
920
0
        Py_ssize_t incr;
921
922
0
        ch = PyUnicode_READ(kind, data, i);
923
0
        if (ch < 10)
924
0
            incr = 2+1+1;
925
0
        else if (ch < 100)
926
0
            incr = 2+2+1;
927
0
        else if (ch < 1000)
928
0
            incr = 2+3+1;
929
0
        else if (ch < 10000)
930
0
            incr = 2+4+1;
931
0
        else if (ch < 100000)
932
0
            incr = 2+5+1;
933
0
        else if (ch < 1000000)
934
0
            incr = 2+6+1;
935
0
        else {
936
0
            assert(ch <= MAX_UNICODE);
937
0
            incr = 2+7+1;
938
0
        }
939
0
        if (size > PY_SSIZE_T_MAX - incr) {
940
0
            PyErr_SetString(PyExc_OverflowError,
941
0
                            "encoded result is too long for a Python string");
942
0
            return NULL;
943
0
        }
944
0
        size += incr;
945
0
    }
946
947
0
    str = _PyBytesWriter_Prepare(writer, str, size);
948
0
    if (str == NULL)
949
0
        return NULL;
950
951
    /* generate replacement */
952
0
    for (i = collstart; i < collend; ++i) {
953
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
954
0
        if (size < 0) {
955
0
            return NULL;
956
0
        }
957
0
        str += size;
958
0
    }
959
0
    return str;
960
0
}
961
962
/* --- Bloom Filters ----------------------------------------------------- */
963
964
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
967
968
/* the linebreak mask is set up by _PyUnicode_Init() below */
969
970
/* BLOOM_WIDTH is the number of bits used in a bloom mask; it is picked from
   the width of `long` so that a mask fits in one BLOOM_MASK value. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* Starts with every bit set, so BLOOM() never rejects a character until a
   real mask is stored here. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by `ch` is set in `mask`.  Both arguments are
   parenthesized so that expression arguments (e.g. `a | b`) are evaluated
   as a whole before the `&`. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: table lookup for ch < 128, otherwise the bloom mask is
   consulted first and Py_UNICODE_ISLINEBREAK() only on a possible hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
989
990
static inline BLOOM_MASK
991
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
992
8.74M
{
993
8.74M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
994
8.74M
    do {                                               \
995
8.74M
        TYPE *data = (TYPE *)PTR;                      \
996
8.74M
        TYPE *end = data + LEN;                        \
997
8.74M
        Py_UCS4 ch;                                    \
998
19.2M
        for (; data != end; data++) {                  \
999
10.5M
            ch = *data;                                \
1000
10.5M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
1001
10.5M
        }                                              \
1002
8.74M
        break;                                         \
1003
8.74M
    } while (0)
1004
1005
    /* calculate simple bloom-style bitmask for a given unicode string */
1006
1007
8.74M
    BLOOM_MASK mask;
1008
1009
8.74M
    mask = 0;
1010
8.74M
    switch (kind) {
1011
8.74M
    case PyUnicode_1BYTE_KIND:
1012
8.74M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
1013
0
        break;
1014
16
    case PyUnicode_2BYTE_KIND:
1015
16
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
1016
0
        break;
1017
0
    case PyUnicode_4BYTE_KIND:
1018
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
1019
0
        break;
1020
0
    default:
1021
0
        Py_UNREACHABLE();
1022
8.74M
    }
1023
8.74M
    return mask;
1024
1025
8.74M
#undef BLOOM_UPDATE
1026
8.74M
}
1027
1028
static int
1029
ensure_unicode(PyObject *obj)
1030
181M
{
1031
181M
    if (!PyUnicode_Check(obj)) {
1032
0
        PyErr_Format(PyExc_TypeError,
1033
0
                     "must be str, not %.100s",
1034
0
                     Py_TYPE(obj)->tp_name);
1035
0
        return -1;
1036
0
    }
1037
181M
    return 0;
1038
181M
}
1039
1040
/* Compilation of templated routines */
1041
1042
1.13M
#define STRINGLIB_GET_EMPTY() unicode_get_empty()
1043
1044
#include "stringlib/asciilib.h"
1045
#include "stringlib/fastsearch.h"
1046
#include "stringlib/partition.h"
1047
#include "stringlib/split.h"
1048
#include "stringlib/count.h"
1049
#include "stringlib/find.h"
1050
#include "stringlib/find_max_char.h"
1051
#include "stringlib/undef.h"
1052
1053
#include "stringlib/ucs1lib.h"
1054
#include "stringlib/fastsearch.h"
1055
#include "stringlib/partition.h"
1056
#include "stringlib/split.h"
1057
#include "stringlib/count.h"
1058
#include "stringlib/find.h"
1059
#include "stringlib/replace.h"
1060
#include "stringlib/repr.h"
1061
#include "stringlib/find_max_char.h"
1062
#include "stringlib/undef.h"
1063
1064
#include "stringlib/ucs2lib.h"
1065
#include "stringlib/fastsearch.h"
1066
#include "stringlib/partition.h"
1067
#include "stringlib/split.h"
1068
#include "stringlib/count.h"
1069
#include "stringlib/find.h"
1070
#include "stringlib/replace.h"
1071
#include "stringlib/repr.h"
1072
#include "stringlib/find_max_char.h"
1073
#include "stringlib/undef.h"
1074
1075
#include "stringlib/ucs4lib.h"
1076
#include "stringlib/fastsearch.h"
1077
#include "stringlib/partition.h"
1078
#include "stringlib/split.h"
1079
#include "stringlib/count.h"
1080
#include "stringlib/find.h"
1081
#include "stringlib/replace.h"
1082
#include "stringlib/repr.h"
1083
#include "stringlib/find_max_char.h"
1084
#include "stringlib/undef.h"
1085
1086
#undef STRINGLIB_GET_EMPTY
1087
1088
/* --- Unicode Object ----------------------------------------------------- */
1089
1090
static inline Py_ssize_t
1091
findchar(const void *s, int kind,
1092
         Py_ssize_t size, Py_UCS4 ch,
1093
         int direction)
1094
112M
{
1095
112M
    switch (kind) {
1096
105M
    case PyUnicode_1BYTE_KIND:
1097
105M
        if ((Py_UCS1) ch != ch)
1098
3.49k
            return -1;
1099
105M
        if (direction > 0)
1100
105M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1101
7.35k
        else
1102
7.35k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1103
6.25M
    case PyUnicode_2BYTE_KIND:
1104
6.25M
        if ((Py_UCS2) ch != ch)
1105
0
            return -1;
1106
6.25M
        if (direction > 0)
1107
6.20M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1108
44.2k
        else
1109
44.2k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1110
836k
    case PyUnicode_4BYTE_KIND:
1111
836k
        if (direction > 0)
1112
682k
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1113
153k
        else
1114
153k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1115
0
    default:
1116
0
        Py_UNREACHABLE();
1117
112M
    }
1118
112M
}
1119
1120
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Only the characters from index old_length up to the current length are
   overwritten (with 0xff bytes); nothing happens if the string did not
   grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* The kind value is used as the byte width of one character. */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1138
1139
static PyObject*
1140
resize_copy(PyObject *unicode, Py_ssize_t length)
1141
0
{
1142
0
    Py_ssize_t copy_length;
1143
0
    PyObject *copy;
1144
1145
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1146
0
    if (copy == NULL)
1147
0
        return NULL;
1148
1149
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1150
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1151
0
    return copy;
1152
0
}
1153
1154
static PyObject*
1155
resize_compact(PyObject *unicode, Py_ssize_t length)
1156
63.0M
{
1157
63.0M
    Py_ssize_t char_size;
1158
63.0M
    Py_ssize_t struct_size;
1159
63.0M
    Py_ssize_t new_size;
1160
63.0M
    PyObject *new_unicode;
1161
#ifdef Py_DEBUG
1162
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1163
#endif
1164
1165
63.0M
    if (!unicode_modifiable(unicode)) {
1166
0
        PyObject *copy = resize_copy(unicode, length);
1167
0
        if (copy == NULL) {
1168
0
            return NULL;
1169
0
        }
1170
0
        Py_DECREF(unicode);
1171
0
        return copy;
1172
0
    }
1173
63.0M
    assert(PyUnicode_IS_COMPACT(unicode));
1174
1175
63.0M
    char_size = PyUnicode_KIND(unicode);
1176
63.0M
    if (PyUnicode_IS_ASCII(unicode))
1177
54.4M
        struct_size = sizeof(PyASCIIObject);
1178
8.56M
    else
1179
8.56M
        struct_size = sizeof(PyCompactUnicodeObject);
1180
1181
63.0M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1182
0
        PyErr_NoMemory();
1183
0
        return NULL;
1184
0
    }
1185
63.0M
    new_size = (struct_size + (length + 1) * char_size);
1186
1187
63.0M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1188
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1189
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1190
0
        PyUnicode_SET_UTF8(unicode, NULL);
1191
0
    }
1192
#ifdef Py_TRACE_REFS
1193
    _Py_ForgetReference(unicode);
1194
#endif
1195
63.0M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1196
1197
63.0M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1198
63.0M
    if (new_unicode == NULL) {
1199
0
        _Py_NewReferenceNoTotal(unicode);
1200
0
        PyErr_NoMemory();
1201
0
        return NULL;
1202
0
    }
1203
63.0M
    unicode = new_unicode;
1204
63.0M
    _Py_NewReferenceNoTotal(unicode);
1205
1206
63.0M
    _PyUnicode_LENGTH(unicode) = length;
1207
#ifdef Py_DEBUG
1208
    unicode_fill_invalid(unicode, old_length);
1209
#endif
1210
63.0M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1211
63.0M
                    length, 0);
1212
63.0M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1213
63.0M
    return unicode;
1214
63.0M
}
1215
1216
static int
1217
resize_inplace(PyObject *unicode, Py_ssize_t length)
1218
0
{
1219
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1220
0
    assert(Py_REFCNT(unicode) == 1);
1221
1222
0
    Py_ssize_t new_size;
1223
0
    Py_ssize_t char_size;
1224
0
    int share_utf8;
1225
0
    void *data;
1226
#ifdef Py_DEBUG
1227
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1228
#endif
1229
1230
0
    data = _PyUnicode_DATA_ANY(unicode);
1231
0
    char_size = PyUnicode_KIND(unicode);
1232
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1233
1234
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1235
0
        PyErr_NoMemory();
1236
0
        return -1;
1237
0
    }
1238
0
    new_size = (length + 1) * char_size;
1239
1240
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1241
0
    {
1242
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1243
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1244
0
        PyUnicode_SET_UTF8(unicode, NULL);
1245
0
    }
1246
1247
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1248
0
    if (data == NULL) {
1249
0
        PyErr_NoMemory();
1250
0
        return -1;
1251
0
    }
1252
0
    _PyUnicode_DATA_ANY(unicode) = data;
1253
0
    if (share_utf8) {
1254
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1255
0
        PyUnicode_SET_UTF8(unicode, data);
1256
0
    }
1257
0
    _PyUnicode_LENGTH(unicode) = length;
1258
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1259
#ifdef Py_DEBUG
1260
    unicode_fill_invalid(unicode, old_length);
1261
#endif
1262
1263
    /* check for integer overflow */
1264
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1265
0
        PyErr_NoMemory();
1266
0
        return -1;
1267
0
    }
1268
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1269
0
    return 0;
1270
0
}
1271
1272
static const char*
1273
unicode_kind_name(PyObject *unicode)
1274
0
{
1275
    /* don't check consistency: unicode_kind_name() is called from
1276
       _PyUnicode_Dump() */
1277
0
    if (!PyUnicode_IS_COMPACT(unicode))
1278
0
    {
1279
0
        switch (PyUnicode_KIND(unicode))
1280
0
        {
1281
0
        case PyUnicode_1BYTE_KIND:
1282
0
            if (PyUnicode_IS_ASCII(unicode))
1283
0
                return "legacy ascii";
1284
0
            else
1285
0
                return "legacy latin1";
1286
0
        case PyUnicode_2BYTE_KIND:
1287
0
            return "legacy UCS2";
1288
0
        case PyUnicode_4BYTE_KIND:
1289
0
            return "legacy UCS4";
1290
0
        default:
1291
0
            return "<legacy invalid kind>";
1292
0
        }
1293
0
    }
1294
0
    switch (PyUnicode_KIND(unicode)) {
1295
0
    case PyUnicode_1BYTE_KIND:
1296
0
        if (PyUnicode_IS_ASCII(unicode))
1297
0
            return "ascii";
1298
0
        else
1299
0
            return "latin1";
1300
0
    case PyUnicode_2BYTE_KIND:
1301
0
        return "UCS2";
1302
0
    case PyUnicode_4BYTE_KIND:
1303
0
        return "UCS4";
1304
0
    default:
1305
0
        return "<invalid compact kind>";
1306
0
    }
1307
0
}
1308
1309
#ifdef Py_DEBUG
1310
/* Functions wrapping macros for use in debugger */
1311
/* Callable form of PyUnicode_UTF8() taking an untyped object pointer. */
const char *_PyUnicode_utf8(void *unicode_raw){
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1315
1316
/* Callable form of _PyUnicode_COMPACT_DATA() taking an untyped object
   pointer. */
const void *_PyUnicode_compact_data(void *unicode_raw) {
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
1320
const void *_PyUnicode_data(void *unicode_raw) {
1321
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1322
    printf("obj %p\n", (void*)unicode);
1323
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1324
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1325
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1326
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1327
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1328
    return PyUnicode_DATA(unicode);
1329
}
1330
1331
void
1332
_PyUnicode_Dump(PyObject *op)
1333
{
1334
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1335
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1336
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1337
    const void *data;
1338
1339
    if (ascii->state.compact)
1340
    {
1341
        if (ascii->state.ascii)
1342
            data = (ascii + 1);
1343
        else
1344
            data = (compact + 1);
1345
    }
1346
    else
1347
        data = unicode->data.any;
1348
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1349
1350
    if (!ascii->state.ascii) {
1351
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1352
    }
1353
    printf(", data=%p\n", data);
1354
}
1355
#endif
1356
1357
1358
PyObject *
1359
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1360
538M
{
1361
    /* Optimization for empty strings */
1362
538M
    if (size == 0) {
1363
25.3M
        return unicode_get_empty();
1364
25.3M
    }
1365
1366
513M
    PyObject *obj;
1367
513M
    PyCompactUnicodeObject *unicode;
1368
513M
    void *data;
1369
513M
    int kind;
1370
513M
    int is_ascii;
1371
513M
    Py_ssize_t char_size;
1372
513M
    Py_ssize_t struct_size;
1373
1374
513M
    is_ascii = 0;
1375
513M
    struct_size = sizeof(PyCompactUnicodeObject);
1376
513M
    if (maxchar < 128) {
1377
296M
        kind = PyUnicode_1BYTE_KIND;
1378
296M
        char_size = 1;
1379
296M
        is_ascii = 1;
1380
296M
        struct_size = sizeof(PyASCIIObject);
1381
296M
    }
1382
216M
    else if (maxchar < 256) {
1383
25.4M
        kind = PyUnicode_1BYTE_KIND;
1384
25.4M
        char_size = 1;
1385
25.4M
    }
1386
191M
    else if (maxchar < 65536) {
1387
183M
        kind = PyUnicode_2BYTE_KIND;
1388
183M
        char_size = 2;
1389
183M
    }
1390
8.19M
    else {
1391
8.19M
        if (maxchar > MAX_UNICODE) {
1392
0
            PyErr_SetString(PyExc_SystemError,
1393
0
                            "invalid maximum character passed to PyUnicode_New");
1394
0
            return NULL;
1395
0
        }
1396
8.19M
        kind = PyUnicode_4BYTE_KIND;
1397
8.19M
        char_size = 4;
1398
8.19M
    }
1399
1400
    /* Ensure we won't overflow the size. */
1401
513M
    if (size < 0) {
1402
0
        PyErr_SetString(PyExc_SystemError,
1403
0
                        "Negative size passed to PyUnicode_New");
1404
0
        return NULL;
1405
0
    }
1406
513M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1407
0
        return PyErr_NoMemory();
1408
1409
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1410
     * PyObject_New() so we are able to allocate space for the object and
1411
     * it's data buffer.
1412
     */
1413
513M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1414
513M
    if (obj == NULL) {
1415
0
        return PyErr_NoMemory();
1416
0
    }
1417
513M
    _PyObject_Init(obj, &PyUnicode_Type);
1418
1419
513M
    unicode = (PyCompactUnicodeObject *)obj;
1420
513M
    if (is_ascii)
1421
296M
        data = ((PyASCIIObject*)obj) + 1;
1422
216M
    else
1423
216M
        data = unicode + 1;
1424
513M
    _PyUnicode_LENGTH(unicode) = size;
1425
513M
    _PyUnicode_HASH(unicode) = -1;
1426
513M
    _PyUnicode_STATE(unicode).interned = 0;
1427
513M
    _PyUnicode_STATE(unicode).kind = kind;
1428
513M
    _PyUnicode_STATE(unicode).compact = 1;
1429
513M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1430
513M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1431
513M
    if (is_ascii) {
1432
296M
        ((char*)data)[size] = 0;
1433
296M
    }
1434
216M
    else if (kind == PyUnicode_1BYTE_KIND) {
1435
25.4M
        ((char*)data)[size] = 0;
1436
25.4M
        unicode->utf8 = NULL;
1437
25.4M
        unicode->utf8_length = 0;
1438
25.4M
    }
1439
191M
    else {
1440
191M
        unicode->utf8 = NULL;
1441
191M
        unicode->utf8_length = 0;
1442
191M
        if (kind == PyUnicode_2BYTE_KIND)
1443
183M
            ((Py_UCS2*)data)[size] = 0;
1444
8.19M
        else /* kind == PyUnicode_4BYTE_KIND */
1445
8.19M
            ((Py_UCS4*)data)[size] = 0;
1446
191M
    }
1447
#ifdef Py_DEBUG
1448
    unicode_fill_invalid((PyObject*)unicode, 0);
1449
#endif
1450
513M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1451
513M
    return obj;
1452
513M
}
1453
1454
static int
1455
unicode_check_modifiable(PyObject *unicode)
1456
698
{
1457
698
    if (!unicode_modifiable(unicode)) {
1458
0
        PyErr_SetString(PyExc_SystemError,
1459
0
                        "Cannot modify a string currently used");
1460
0
        return -1;
1461
0
    }
1462
698
    return 0;
1463
698
}
1464
1465
static int
1466
_copy_characters(PyObject *to, Py_ssize_t to_start,
1467
                 PyObject *from, Py_ssize_t from_start,
1468
                 Py_ssize_t how_many, int check_maxchar)
1469
281M
{
1470
281M
    int from_kind, to_kind;
1471
281M
    const void *from_data;
1472
281M
    void *to_data;
1473
1474
281M
    assert(0 <= how_many);
1475
281M
    assert(0 <= from_start);
1476
281M
    assert(0 <= to_start);
1477
281M
    assert(PyUnicode_Check(from));
1478
281M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1479
1480
281M
    assert(to == NULL || PyUnicode_Check(to));
1481
1482
281M
    if (how_many == 0) {
1483
317k
        return 0;
1484
317k
    }
1485
1486
280M
    assert(to != NULL);
1487
280M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1488
1489
280M
    from_kind = PyUnicode_KIND(from);
1490
280M
    from_data = PyUnicode_DATA(from);
1491
280M
    to_kind = PyUnicode_KIND(to);
1492
280M
    to_data = PyUnicode_DATA(to);
1493
1494
#ifdef Py_DEBUG
1495
    if (!check_maxchar
1496
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1497
    {
1498
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1499
        Py_UCS4 ch;
1500
        Py_ssize_t i;
1501
        for (i=0; i < how_many; i++) {
1502
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1503
            assert(ch <= to_maxchar);
1504
        }
1505
    }
1506
#endif
1507
1508
280M
    if (from_kind == to_kind) {
1509
176M
        if (check_maxchar
1510
176M
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1511
0
        {
1512
            /* Writing Latin-1 characters into an ASCII string requires to
1513
               check that all written characters are pure ASCII */
1514
0
            Py_UCS4 max_char;
1515
0
            max_char = ucs1lib_find_max_char(from_data,
1516
0
                                             (const Py_UCS1*)from_data + how_many);
1517
0
            if (max_char >= 128)
1518
0
                return -1;
1519
0
        }
1520
176M
        memcpy((char*)to_data + to_kind * to_start,
1521
176M
                  (const char*)from_data + from_kind * from_start,
1522
176M
                  to_kind * how_many);
1523
176M
    }
1524
104M
    else if (from_kind == PyUnicode_1BYTE_KIND
1525
104M
             && to_kind == PyUnicode_2BYTE_KIND)
1526
88.2M
    {
1527
88.2M
        _PyUnicode_CONVERT_BYTES(
1528
88.2M
            Py_UCS1, Py_UCS2,
1529
88.2M
            PyUnicode_1BYTE_DATA(from) + from_start,
1530
88.2M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1531
88.2M
            PyUnicode_2BYTE_DATA(to) + to_start
1532
88.2M
            );
1533
88.2M
    }
1534
16.0M
    else if (from_kind == PyUnicode_1BYTE_KIND
1535
16.0M
             && to_kind == PyUnicode_4BYTE_KIND)
1536
13.9M
    {
1537
13.9M
        _PyUnicode_CONVERT_BYTES(
1538
13.9M
            Py_UCS1, Py_UCS4,
1539
13.9M
            PyUnicode_1BYTE_DATA(from) + from_start,
1540
13.9M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1541
13.9M
            PyUnicode_4BYTE_DATA(to) + to_start
1542
13.9M
            );
1543
13.9M
    }
1544
2.11M
    else if (from_kind == PyUnicode_2BYTE_KIND
1545
2.11M
             && to_kind == PyUnicode_4BYTE_KIND)
1546
2.08M
    {
1547
2.08M
        _PyUnicode_CONVERT_BYTES(
1548
2.08M
            Py_UCS2, Py_UCS4,
1549
2.08M
            PyUnicode_2BYTE_DATA(from) + from_start,
1550
2.08M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1551
2.08M
            PyUnicode_4BYTE_DATA(to) + to_start
1552
2.08M
            );
1553
2.08M
    }
1554
33.0k
    else {
1555
33.0k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1556
1557
33.0k
        if (!check_maxchar) {
1558
33.0k
            if (from_kind == PyUnicode_2BYTE_KIND
1559
33.0k
                && to_kind == PyUnicode_1BYTE_KIND)
1560
2.06k
            {
1561
2.06k
                _PyUnicode_CONVERT_BYTES(
1562
2.06k
                    Py_UCS2, Py_UCS1,
1563
2.06k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1564
2.06k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1565
2.06k
                    PyUnicode_1BYTE_DATA(to) + to_start
1566
2.06k
                    );
1567
2.06k
            }
1568
30.9k
            else if (from_kind == PyUnicode_4BYTE_KIND
1569
30.9k
                     && to_kind == PyUnicode_1BYTE_KIND)
1570
11.0k
            {
1571
11.0k
                _PyUnicode_CONVERT_BYTES(
1572
11.0k
                    Py_UCS4, Py_UCS1,
1573
11.0k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1574
11.0k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1575
11.0k
                    PyUnicode_1BYTE_DATA(to) + to_start
1576
11.0k
                    );
1577
11.0k
            }
1578
19.9k
            else if (from_kind == PyUnicode_4BYTE_KIND
1579
19.9k
                     && to_kind == PyUnicode_2BYTE_KIND)
1580
19.9k
            {
1581
19.9k
                _PyUnicode_CONVERT_BYTES(
1582
19.9k
                    Py_UCS4, Py_UCS2,
1583
19.9k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1584
19.9k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1585
19.9k
                    PyUnicode_2BYTE_DATA(to) + to_start
1586
19.9k
                    );
1587
19.9k
            }
1588
0
            else {
1589
0
                Py_UNREACHABLE();
1590
0
            }
1591
33.0k
        }
1592
0
        else {
1593
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1594
0
            Py_UCS4 ch;
1595
0
            Py_ssize_t i;
1596
1597
0
            for (i=0; i < how_many; i++) {
1598
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1599
0
                if (ch > to_maxchar)
1600
0
                    return -1;
1601
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1602
0
            }
1603
0
        }
1604
33.0k
    }
1605
280M
    return 0;
1606
280M
}
1607
1608
void
1609
_PyUnicode_FastCopyCharacters(
1610
    PyObject *to, Py_ssize_t to_start,
1611
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1612
281M
{
1613
281M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1614
281M
}
1615
1616
Py_ssize_t
1617
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1618
                         PyObject *from, Py_ssize_t from_start,
1619
                         Py_ssize_t how_many)
1620
0
{
1621
0
    int err;
1622
1623
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1624
0
        PyErr_BadInternalCall();
1625
0
        return -1;
1626
0
    }
1627
1628
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1629
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1630
0
        return -1;
1631
0
    }
1632
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1633
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1634
0
        return -1;
1635
0
    }
1636
0
    if (how_many < 0) {
1637
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1638
0
        return -1;
1639
0
    }
1640
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1641
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1642
0
        PyErr_Format(PyExc_SystemError,
1643
0
                     "Cannot write %zi characters at %zi "
1644
0
                     "in a string of %zi characters",
1645
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1646
0
        return -1;
1647
0
    }
1648
1649
0
    if (how_many == 0)
1650
0
        return 0;
1651
1652
0
    if (unicode_check_modifiable(to))
1653
0
        return -1;
1654
1655
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1656
0
    if (err) {
1657
0
        PyErr_Format(PyExc_SystemError,
1658
0
                     "Cannot copy %s characters "
1659
0
                     "into a string of %s characters",
1660
0
                     unicode_kind_name(from),
1661
0
                     unicode_kind_name(to));
1662
0
        return -1;
1663
0
    }
1664
0
    return how_many;
1665
0
}
1666
1667
/* Find the maximum code point and count the number of surrogate pairs so a
1668
   correct string length can be computed before converting a string to UCS4.
1669
   This function counts single surrogates as a character and not as a pair.
1670
1671
   Return 0 on success, or -1 on error. */
1672
static int
1673
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1674
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1675
16.6k
{
1676
16.6k
    const wchar_t *iter;
1677
16.6k
    Py_UCS4 ch;
1678
1679
16.6k
    assert(num_surrogates != NULL && maxchar != NULL);
1680
16.6k
    *num_surrogates = 0;
1681
16.6k
    *maxchar = 0;
1682
1683
368k
    for (iter = begin; iter < end; ) {
1684
#if SIZEOF_WCHAR_T == 2
1685
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1686
            && (iter+1) < end
1687
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1688
        {
1689
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1690
            ++(*num_surrogates);
1691
            iter += 2;
1692
        }
1693
        else
1694
#endif
1695
352k
        {
1696
352k
            ch = *iter;
1697
352k
            iter++;
1698
352k
        }
1699
352k
        if (ch > *maxchar) {
1700
72.0k
            *maxchar = ch;
1701
72.0k
            if (*maxchar > MAX_UNICODE) {
1702
0
                PyErr_Format(PyExc_ValueError,
1703
0
                             "character U+%x is not in range [U+0000; U+%x]",
1704
0
                             ch, MAX_UNICODE);
1705
0
                return -1;
1706
0
            }
1707
72.0k
        }
1708
352k
    }
1709
16.6k
    return 0;
1710
16.6k
}
1711
1712
static void
1713
unicode_dealloc(PyObject *unicode)
1714
523M
{
1715
#ifdef Py_DEBUG
1716
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1717
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1718
    }
1719
#endif
1720
523M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1721
        /* This should never get called, but we also don't want to SEGV if
1722
        * we accidentally decref an immortal string out of existence. Since
1723
        * the string is an immortal object, just re-set the reference count.
1724
        */
1725
#ifdef Py_DEBUG
1726
        Py_UNREACHABLE();
1727
#endif
1728
0
        _Py_SetImmortal(unicode);
1729
0
        return;
1730
0
    }
1731
523M
    switch (_PyUnicode_STATE(unicode).interned) {
1732
523M
        case SSTATE_NOT_INTERNED:
1733
523M
            break;
1734
594k
        case SSTATE_INTERNED_MORTAL:
1735
            /* Remove the object from the intern dict.
1736
             * Before doing so, we set the refcount to 2: the key and value
1737
             * in the interned_dict.
1738
             */
1739
594k
            assert(Py_REFCNT(unicode) == 0);
1740
594k
            Py_SET_REFCNT(unicode, 2);
1741
#ifdef Py_REF_DEBUG
1742
            /* let's be pedantic with the ref total */
1743
            _Py_IncRefTotal(_PyThreadState_GET());
1744
            _Py_IncRefTotal(_PyThreadState_GET());
1745
#endif
1746
594k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1747
594k
            PyObject *interned = get_interned_dict(interp);
1748
594k
            assert(interned != NULL);
1749
594k
            PyObject *popped;
1750
594k
            int r = PyDict_Pop(interned, unicode, &popped);
1751
594k
            if (r == -1) {
1752
0
                PyErr_FormatUnraisable("Exception ignored while "
1753
0
                                       "removing an interned string %R",
1754
0
                                       unicode);
1755
                // We don't know what happened to the string. It's probably
1756
                // best to leak it:
1757
                // - if it was popped, there are no more references to it
1758
                //   so it can't cause trouble (except wasted memory)
1759
                // - if it wasn't popped, it'll remain interned
1760
0
                _Py_SetImmortal(unicode);
1761
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1762
0
                return;
1763
0
            }
1764
594k
            if (r == 0) {
1765
                // The interned string was not found in the interned_dict.
1766
#ifdef Py_DEBUG
1767
                Py_UNREACHABLE();
1768
#endif
1769
0
                _Py_SetImmortal(unicode);
1770
0
                return;
1771
0
            }
1772
            // Successfully popped.
1773
594k
            assert(popped == unicode);
1774
            // Only our `popped` reference should be left; remove it too.
1775
594k
            assert(Py_REFCNT(unicode) == 1);
1776
594k
            Py_SET_REFCNT(unicode, 0);
1777
#ifdef Py_REF_DEBUG
1778
            /* let's be pedantic with the ref total */
1779
            _Py_DecRefTotal(_PyThreadState_GET());
1780
#endif
1781
594k
            break;
1782
0
        default:
1783
            // As with `statically_allocated` above.
1784
#ifdef Py_REF_DEBUG
1785
            Py_UNREACHABLE();
1786
#endif
1787
0
            _Py_SetImmortal(unicode);
1788
0
            return;
1789
523M
    }
1790
523M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1791
162k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1792
162k
    }
1793
523M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1794
10.4M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1795
10.4M
    }
1796
1797
523M
    Py_TYPE(unicode)->tp_free(unicode);
1798
523M
}
1799
1800
#ifdef Py_DEBUG
/* Return 1 if `unicode` is the empty string object _Py_STR(empty) or the
   LATIN1() object for its single character, and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    /* Only one-character strings can be a LATIN1() object. */
    if (_PyASCIIObject_CAST(unicode)->length != 1) {
        return 0;
    }
    Py_UCS4 only_char = PyUnicode_READ_CHAR(unicode, 0);
    if (only_char >= 256) {
        return 0;
    }
    return LATIN1(only_char) == unicode;
}
#endif
1818
1819
static int
1820
unicode_modifiable(PyObject *unicode)
1821
64.3M
{
1822
64.3M
    assert(_PyUnicode_CHECK(unicode));
1823
64.3M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1824
49.7k
        return 0;
1825
64.2M
    if (PyUnicode_HASH(unicode) != -1)
1826
0
        return 0;
1827
64.2M
    if (PyUnicode_CHECK_INTERNED(unicode))
1828
0
        return 0;
1829
64.2M
    if (!PyUnicode_CheckExact(unicode))
1830
0
        return 0;
1831
#ifdef Py_DEBUG
1832
    /* singleton refcount is greater than 1 */
1833
    assert(!unicode_is_singleton(unicode));
1834
#endif
1835
64.2M
    return 1;
1836
64.2M
}
1837
1838
static int
1839
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1840
631k
{
1841
631k
    PyObject *unicode;
1842
631k
    Py_ssize_t old_length;
1843
1844
631k
    assert(p_unicode != NULL);
1845
631k
    unicode = *p_unicode;
1846
1847
631k
    assert(unicode != NULL);
1848
631k
    assert(PyUnicode_Check(unicode));
1849
631k
    assert(0 <= length);
1850
1851
631k
    old_length = PyUnicode_GET_LENGTH(unicode);
1852
631k
    if (old_length == length)
1853
0
        return 0;
1854
1855
631k
    if (length == 0) {
1856
0
        PyObject *empty = unicode_get_empty();
1857
0
        Py_SETREF(*p_unicode, empty);
1858
0
        return 0;
1859
0
    }
1860
1861
631k
    if (!unicode_modifiable(unicode)) {
1862
0
        PyObject *copy = resize_copy(unicode, length);
1863
0
        if (copy == NULL)
1864
0
            return -1;
1865
0
        Py_SETREF(*p_unicode, copy);
1866
0
        return 0;
1867
0
    }
1868
1869
631k
    if (PyUnicode_IS_COMPACT(unicode)) {
1870
631k
        PyObject *new_unicode = resize_compact(unicode, length);
1871
631k
        if (new_unicode == NULL)
1872
0
            return -1;
1873
631k
        *p_unicode = new_unicode;
1874
631k
        return 0;
1875
631k
    }
1876
0
    return resize_inplace(unicode, length);
1877
631k
}
1878
1879
int
1880
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1881
0
{
1882
0
    PyObject *unicode;
1883
0
    if (p_unicode == NULL) {
1884
0
        PyErr_BadInternalCall();
1885
0
        return -1;
1886
0
    }
1887
0
    unicode = *p_unicode;
1888
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1889
0
    {
1890
0
        PyErr_BadInternalCall();
1891
0
        return -1;
1892
0
    }
1893
0
    return unicode_resize(p_unicode, length);
1894
0
}
1895
1896
/* Copy an ASCII or latin1 char* string into a Python Unicode string.
1897
1898
   WARNING: The function doesn't copy the terminating null character and
1899
   doesn't check the maximum character (may write a latin1 character in an
1900
   ASCII string). */
1901
static void
1902
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1903
                   const char *str, Py_ssize_t len)
1904
0
{
1905
0
    int kind = PyUnicode_KIND(unicode);
1906
0
    const void *data = PyUnicode_DATA(unicode);
1907
0
    const char *end = str + len;
1908
1909
0
    assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1910
0
    switch (kind) {
1911
0
    case PyUnicode_1BYTE_KIND: {
1912
#ifdef Py_DEBUG
1913
        if (PyUnicode_IS_ASCII(unicode)) {
1914
            Py_UCS4 maxchar = ucs1lib_find_max_char(
1915
                (const Py_UCS1*)str,
1916
                (const Py_UCS1*)str + len);
1917
            assert(maxchar < 128);
1918
        }
1919
#endif
1920
0
        memcpy((char *) data + index, str, len);
1921
0
        break;
1922
0
    }
1923
0
    case PyUnicode_2BYTE_KIND: {
1924
0
        Py_UCS2 *start = (Py_UCS2 *)data + index;
1925
0
        Py_UCS2 *ucs2 = start;
1926
1927
0
        for (; str < end; ++ucs2, ++str)
1928
0
            *ucs2 = (Py_UCS2)*str;
1929
1930
0
        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1931
0
        break;
1932
0
    }
1933
0
    case PyUnicode_4BYTE_KIND: {
1934
0
        Py_UCS4 *start = (Py_UCS4 *)data + index;
1935
0
        Py_UCS4 *ucs4 = start;
1936
1937
0
        for (; str < end; ++ucs4, ++str)
1938
0
            *ucs4 = (Py_UCS4)*str;
1939
1940
0
        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1941
0
        break;
1942
0
    }
1943
0
    default:
1944
0
        Py_UNREACHABLE();
1945
0
    }
1946
0
}
1947
1948
static PyObject*
1949
get_latin1_char(Py_UCS1 ch)
1950
251M
{
1951
251M
    PyObject *o = LATIN1(ch);
1952
251M
    return o;
1953
251M
}
1954
1955
static PyObject*
1956
unicode_char(Py_UCS4 ch)
1957
313M
{
1958
313M
    PyObject *unicode;
1959
1960
313M
    assert(ch <= MAX_UNICODE);
1961
1962
313M
    if (ch < 256) {
1963
188M
        return get_latin1_char(ch);
1964
188M
    }
1965
1966
124M
    unicode = PyUnicode_New(1, ch);
1967
124M
    if (unicode == NULL)
1968
0
        return NULL;
1969
1970
124M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1971
124M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1972
119M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1973
119M
    } else {
1974
5.25M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1975
5.25M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1976
5.25M
    }
1977
124M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1978
124M
    return unicode;
1979
124M
}
1980
1981
1982
static inline void
1983
unicode_write_widechar(int kind, void *data,
1984
                       const wchar_t *u, Py_ssize_t size,
1985
                       Py_ssize_t num_surrogates)
1986
16.6k
{
1987
16.6k
    switch (kind) {
1988
16.6k
    case PyUnicode_1BYTE_KIND:
1989
16.6k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1990
16.6k
        break;
1991
1992
0
    case PyUnicode_2BYTE_KIND:
1993
#if SIZEOF_WCHAR_T == 2
1994
        memcpy(data, u, size * 2);
1995
#else
1996
0
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1997
0
#endif
1998
0
        break;
1999
2000
0
    case PyUnicode_4BYTE_KIND:
2001
0
    {
2002
#if SIZEOF_WCHAR_T == 2
2003
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
2004
        // surrogate pairs.
2005
        const wchar_t *end = u + size;
2006
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
2007
#  ifndef NDEBUG
2008
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
2009
#  endif
2010
        for (const wchar_t *iter = u; iter < end; ) {
2011
            assert(ucs4_out < ucs4_end);
2012
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
2013
                && (iter+1) < end
2014
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
2015
            {
2016
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
2017
                iter += 2;
2018
            }
2019
            else {
2020
                *ucs4_out++ = *iter;
2021
                iter++;
2022
            }
2023
        }
2024
        assert(ucs4_out == ucs4_end);
2025
#else
2026
0
        assert(num_surrogates == 0);
2027
0
        memcpy(data, u, size * 4);
2028
0
#endif
2029
0
        break;
2030
0
    }
2031
0
    default:
2032
0
        Py_UNREACHABLE();
2033
16.6k
    }
2034
16.6k
}
2035
2036
2037
PyObject *
2038
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2039
16.7k
{
2040
16.7k
    PyObject *unicode;
2041
16.7k
    Py_UCS4 maxchar = 0;
2042
16.7k
    Py_ssize_t num_surrogates;
2043
2044
16.7k
    if (u == NULL && size != 0) {
2045
0
        PyErr_BadInternalCall();
2046
0
        return NULL;
2047
0
    }
2048
2049
16.7k
    if (size == -1) {
2050
576
        size = wcslen(u);
2051
576
    }
2052
2053
    /* If the Unicode data is known at construction time, we can apply
2054
       some optimizations which share commonly used objects. */
2055
2056
    /* Optimization for empty strings */
2057
16.7k
    if (size == 0)
2058
32
        _Py_RETURN_UNICODE_EMPTY();
2059
2060
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2061
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2062
       non-Unicode locales and hence needs conversion to UCS-4 first. */
2063
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2064
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2065
        if (!converted) {
2066
            return NULL;
2067
        }
2068
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2069
        PyMem_Free(converted);
2070
        return unicode;
2071
    }
2072
#endif
2073
2074
    /* Single character Unicode objects in the Latin-1 range are
2075
       shared when using this constructor */
2076
16.6k
    if (size == 1 && (Py_UCS4)*u < 256)
2077
0
        return get_latin1_char((unsigned char)*u);
2078
2079
    /* If not empty and not single character, copy the Unicode data
2080
       into the new object */
2081
16.6k
    if (find_maxchar_surrogates(u, u + size,
2082
16.6k
                                &maxchar, &num_surrogates) == -1)
2083
0
        return NULL;
2084
2085
16.6k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
2086
16.6k
    if (!unicode)
2087
0
        return NULL;
2088
2089
16.6k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2090
16.6k
                           u, size, num_surrogates);
2091
2092
16.6k
    return unicode_result(unicode);
2093
16.6k
}
2094
2095
2096
int
2097
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
2098
                              const wchar_t *str,
2099
                              Py_ssize_t size)
2100
0
{
2101
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
2102
2103
0
    if (size < 0) {
2104
0
        size = wcslen(str);
2105
0
    }
2106
2107
0
    if (size == 0) {
2108
0
        return 0;
2109
0
    }
2110
2111
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2112
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2113
       non-Unicode locales and hence needs conversion to UCS-4 first. */
2114
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2115
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
2116
        if (!converted) {
2117
            return -1;
2118
        }
2119
2120
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
2121
        PyMem_Free(converted);
2122
        return res;
2123
    }
2124
#endif
2125
2126
0
    Py_UCS4 maxchar = 0;
2127
0
    Py_ssize_t num_surrogates;
2128
0
    if (find_maxchar_surrogates(str, str + size,
2129
0
                                &maxchar, &num_surrogates) == -1) {
2130
0
        return -1;
2131
0
    }
2132
2133
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
2134
0
        return -1;
2135
0
    }
2136
2137
0
    int kind = writer->kind;
2138
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2139
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
2140
2141
0
    writer->pos += size - num_surrogates;
2142
0
    return 0;
2143
0
}
2144
2145
2146
PyObject *
2147
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2148
556k
{
2149
556k
    if (size < 0) {
2150
0
        PyErr_SetString(PyExc_SystemError,
2151
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2152
0
        return NULL;
2153
0
    }
2154
556k
    if (u != NULL) {
2155
556k
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2156
556k
    }
2157
0
    if (size > 0) {
2158
0
        PyErr_SetString(PyExc_SystemError,
2159
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2160
0
        return NULL;
2161
0
    }
2162
0
    return unicode_get_empty();
2163
0
}
2164
2165
PyObject *
2166
PyUnicode_FromString(const char *u)
2167
7.40M
{
2168
7.40M
    size_t size = strlen(u);
2169
7.40M
    if (size > PY_SSIZE_T_MAX) {
2170
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2171
0
        return NULL;
2172
0
    }
2173
7.40M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2174
7.40M
}
2175
2176
2177
PyObject *
2178
_PyUnicode_FromId(_Py_Identifier *id)
2179
0
{
2180
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2181
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2182
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2183
2184
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2185
0
    if (index < 0) {
2186
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2187
2188
0
        PyMutex_Lock(&rt_ids->mutex);
2189
        // Check again to detect concurrent access. Another thread can have
2190
        // initialized the index while this thread waited for the lock.
2191
0
        index = _Py_atomic_load_ssize(&id->index);
2192
0
        if (index < 0) {
2193
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2194
0
            index = rt_ids->next_index;
2195
0
            rt_ids->next_index++;
2196
0
            _Py_atomic_store_ssize(&id->index, index);
2197
0
        }
2198
0
        PyMutex_Unlock(&rt_ids->mutex);
2199
0
    }
2200
0
    assert(index >= 0);
2201
2202
0
    PyObject *obj;
2203
0
    if (index < ids->size) {
2204
0
        obj = ids->array[index];
2205
0
        if (obj) {
2206
            // Return a borrowed reference
2207
0
            goto end;
2208
0
        }
2209
0
    }
2210
2211
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2212
0
                                       NULL, NULL);
2213
0
    if (!obj) {
2214
0
        goto end;
2215
0
    }
2216
0
    _PyUnicode_InternImmortal(interp, &obj);
2217
2218
0
    if (index >= ids->size) {
2219
        // Overallocate to reduce the number of realloc
2220
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2221
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2222
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2223
0
        if (new_array == NULL) {
2224
0
            PyErr_NoMemory();
2225
0
            obj = NULL;
2226
0
            goto end;
2227
0
        }
2228
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2229
0
        ids->array = new_array;
2230
0
        ids->size = new_size;
2231
0
    }
2232
2233
    // The array stores a strong reference
2234
0
    ids->array[index] = obj;
2235
2236
0
end:
2237
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2238
    // Return a borrowed reference
2239
0
    return obj;
2240
0
}
2241
2242
2243
static void
2244
unicode_clear_identifiers(struct _Py_unicode_state *state)
2245
0
{
2246
0
    struct _Py_unicode_ids *ids = &state->ids;
2247
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2248
0
        Py_XDECREF(ids->array[i]);
2249
0
    }
2250
0
    ids->size = 0;
2251
0
    PyMem_Free(ids->array);
2252
0
    ids->array = NULL;
2253
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2254
    // after Py_Finalize().
2255
0
}
2256
2257
2258
/* Internal function, doesn't check maximum character */
2259
2260
PyObject*
2261
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2262
95.9M
{
2263
95.9M
    const unsigned char *s = (const unsigned char *)buffer;
2264
95.9M
    PyObject *unicode;
2265
95.9M
    if (size == 1) {
2266
#ifdef Py_DEBUG
2267
        assert((unsigned char)s[0] < 128);
2268
#endif
2269
38.2M
        return get_latin1_char(s[0]);
2270
38.2M
    }
2271
57.6M
    unicode = PyUnicode_New(size, 127);
2272
57.6M
    if (!unicode)
2273
0
        return NULL;
2274
57.6M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2275
57.6M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2276
57.6M
    return unicode;
2277
57.6M
}
2278
2279
static Py_UCS4
2280
kind_maxchar_limit(int kind)
2281
0
{
2282
0
    switch (kind) {
2283
0
    case PyUnicode_1BYTE_KIND:
2284
0
        return 0x80;
2285
0
    case PyUnicode_2BYTE_KIND:
2286
0
        return 0x100;
2287
0
    case PyUnicode_4BYTE_KIND:
2288
0
        return 0x10000;
2289
0
    default:
2290
0
        Py_UNREACHABLE();
2291
0
    }
2292
0
}
2293
2294
static PyObject*
2295
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2296
64.1M
{
2297
64.1M
    PyObject *res;
2298
64.1M
    unsigned char max_char;
2299
2300
64.1M
    if (size == 0) {
2301
4.45M
        _Py_RETURN_UNICODE_EMPTY();
2302
4.45M
    }
2303
59.6M
    assert(size > 0);
2304
59.6M
    if (size == 1) {
2305
22.2M
        return get_latin1_char(u[0]);
2306
22.2M
    }
2307
2308
37.4M
    max_char = ucs1lib_find_max_char(u, u + size);
2309
37.4M
    res = PyUnicode_New(size, max_char);
2310
37.4M
    if (!res)
2311
0
        return NULL;
2312
37.4M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2313
37.4M
    assert(_PyUnicode_CheckConsistency(res, 1));
2314
37.4M
    return res;
2315
37.4M
}
2316
2317
static PyObject*
2318
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2319
112M
{
2320
112M
    PyObject *res;
2321
112M
    Py_UCS2 max_char;
2322
2323
112M
    if (size == 0)
2324
12.3M
        _Py_RETURN_UNICODE_EMPTY();
2325
100M
    assert(size > 0);
2326
100M
    if (size == 1)
2327
69.9M
        return unicode_char(u[0]);
2328
2329
30.6M
    max_char = ucs2lib_find_max_char(u, u + size);
2330
30.6M
    res = PyUnicode_New(size, max_char);
2331
30.6M
    if (!res)
2332
0
        return NULL;
2333
30.6M
    if (max_char >= 256)
2334
17.1M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2335
13.5M
    else {
2336
13.5M
        _PyUnicode_CONVERT_BYTES(
2337
13.5M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2338
13.5M
    }
2339
30.6M
    assert(_PyUnicode_CheckConsistency(res, 1));
2340
30.6M
    return res;
2341
30.6M
}
2342
2343
static PyObject*
2344
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2345
86.2M
{
2346
86.2M
    PyObject *res;
2347
86.2M
    Py_UCS4 max_char;
2348
2349
86.2M
    if (size == 0)
2350
7.11M
        _Py_RETURN_UNICODE_EMPTY();
2351
79.1M
    assert(size > 0);
2352
79.1M
    if (size == 1)
2353
56.9M
        return unicode_char(u[0]);
2354
2355
22.1M
    max_char = ucs4lib_find_max_char(u, u + size);
2356
22.1M
    res = PyUnicode_New(size, max_char);
2357
22.1M
    if (!res)
2358
0
        return NULL;
2359
22.1M
    if (max_char < 256)
2360
16.6M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2361
22.1M
                                 PyUnicode_1BYTE_DATA(res));
2362
5.52M
    else if (max_char < 0x10000)
2363
3.41M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2364
5.52M
                                 PyUnicode_2BYTE_DATA(res));
2365
2.10M
    else
2366
2.10M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2367
22.1M
    assert(_PyUnicode_CheckConsistency(res, 1));
2368
22.1M
    return res;
2369
22.1M
}
2370
2371
2372
int
2373
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2374
                          Py_UCS4 *str,
2375
                          Py_ssize_t size)
2376
0
{
2377
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2378
2379
0
    if (size < 0) {
2380
0
        PyErr_SetString(PyExc_ValueError,
2381
0
                        "size must be positive");
2382
0
        return -1;
2383
0
    }
2384
2385
0
    if (size == 0) {
2386
0
        return 0;
2387
0
    }
2388
2389
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2390
2391
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2392
0
        return -1;
2393
0
    }
2394
2395
0
    int kind = writer->kind;
2396
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2397
0
    if (kind == PyUnicode_1BYTE_KIND) {
2398
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2399
0
                                 str, str + size,
2400
0
                                 data);
2401
0
    }
2402
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2403
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2404
0
                                 str, str + size,
2405
0
                                 data);
2406
0
    }
2407
0
    else {
2408
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2409
0
    }
2410
0
    writer->pos += size;
2411
2412
0
    return 0;
2413
0
}
2414
2415
2416
PyObject*
2417
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2418
216M
{
2419
216M
    if (size < 0) {
2420
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2421
0
        return NULL;
2422
0
    }
2423
216M
    switch (kind) {
2424
43.4M
    case PyUnicode_1BYTE_KIND:
2425
43.4M
        return _PyUnicode_FromUCS1(buffer, size);
2426
97.8M
    case PyUnicode_2BYTE_KIND:
2427
97.8M
        return _PyUnicode_FromUCS2(buffer, size);
2428
74.9M
    case PyUnicode_4BYTE_KIND:
2429
74.9M
        return _PyUnicode_FromUCS4(buffer, size);
2430
0
    default:
2431
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2432
0
        return NULL;
2433
216M
    }
2434
216M
}
2435
2436
Py_UCS4
2437
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2438
15.4M
{
2439
15.4M
    int kind;
2440
15.4M
    const void *startptr, *endptr;
2441
2442
15.4M
    assert(0 <= start);
2443
15.4M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2444
15.4M
    assert(start <= end);
2445
2446
15.4M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2447
0
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2448
2449
15.4M
    if (start == end)
2450
0
        return 127;
2451
2452
15.4M
    if (PyUnicode_IS_ASCII(unicode))
2453
15.4M
        return 127;
2454
2455
42.1k
    kind = PyUnicode_KIND(unicode);
2456
42.1k
    startptr = PyUnicode_DATA(unicode);
2457
42.1k
    endptr = (char *)startptr + end * kind;
2458
42.1k
    startptr = (char *)startptr + start * kind;
2459
42.1k
    switch(kind) {
2460
1.86k
    case PyUnicode_1BYTE_KIND:
2461
1.86k
        return ucs1lib_find_max_char(startptr, endptr);
2462
3.95k
    case PyUnicode_2BYTE_KIND:
2463
3.95k
        return ucs2lib_find_max_char(startptr, endptr);
2464
36.3k
    case PyUnicode_4BYTE_KIND:
2465
36.3k
        return ucs4lib_find_max_char(startptr, endptr);
2466
0
    default:
2467
0
        Py_UNREACHABLE();
2468
42.1k
    }
2469
42.1k
}
2470
2471
/* Ensure that a string uses the most efficient storage, if it is not the
2472
   case: create a new string with of the right kind. Write NULL into *p_unicode
2473
   on error. */
2474
static void
2475
unicode_adjust_maxchar(PyObject **p_unicode)
2476
0
{
2477
0
    PyObject *unicode, *copy;
2478
0
    Py_UCS4 max_char;
2479
0
    Py_ssize_t len;
2480
0
    int kind;
2481
2482
0
    assert(p_unicode != NULL);
2483
0
    unicode = *p_unicode;
2484
0
    if (PyUnicode_IS_ASCII(unicode))
2485
0
        return;
2486
2487
0
    len = PyUnicode_GET_LENGTH(unicode);
2488
0
    kind = PyUnicode_KIND(unicode);
2489
0
    if (kind == PyUnicode_1BYTE_KIND) {
2490
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2491
0
        max_char = ucs1lib_find_max_char(u, u + len);
2492
0
        if (max_char >= 128)
2493
0
            return;
2494
0
    }
2495
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2496
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2497
0
        max_char = ucs2lib_find_max_char(u, u + len);
2498
0
        if (max_char >= 256)
2499
0
            return;
2500
0
    }
2501
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2502
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2503
0
        max_char = ucs4lib_find_max_char(u, u + len);
2504
0
        if (max_char >= 0x10000)
2505
0
            return;
2506
0
    }
2507
0
    else
2508
0
        Py_UNREACHABLE();
2509
2510
0
    copy = PyUnicode_New(len, max_char);
2511
0
    if (copy != NULL)
2512
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2513
0
    Py_DECREF(unicode);
2514
0
    *p_unicode = copy;
2515
0
}
2516
2517
/* Return a new string holding a copy of the characters of `unicode`.

   Return NULL with an exception set if `unicode` is not a Unicode object or
   if the allocation fails. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *duplicate = PyUnicode_New(nchars,
                                        PyUnicode_MAX_CHAR_VALUE(unicode));
    if (duplicate == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(duplicate) == PyUnicode_KIND(unicode));

    /* Both strings have the same kind, so the raw data can be copied. */
    memcpy(PyUnicode_DATA(duplicate), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(duplicate, 1));
    return duplicate;
}
2539
2540
2541
/* Widen Unicode objects to larger buffers. Don't write terminating null
2542
   character. Return NULL on error. */
2543
2544
static void*
2545
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2546
9.08M
{
2547
9.08M
    void *result;
2548
2549
9.08M
    assert(skind < kind);
2550
9.08M
    switch (kind) {
2551
8.12M
    case PyUnicode_2BYTE_KIND:
2552
8.12M
        result = PyMem_New(Py_UCS2, len);
2553
8.12M
        if (!result)
2554
0
            return PyErr_NoMemory();
2555
8.12M
        assert(skind == PyUnicode_1BYTE_KIND);
2556
8.12M
        _PyUnicode_CONVERT_BYTES(
2557
8.12M
            Py_UCS1, Py_UCS2,
2558
8.12M
            (const Py_UCS1 *)data,
2559
8.12M
            ((const Py_UCS1 *)data) + len,
2560
8.12M
            result);
2561
8.12M
        return result;
2562
960k
    case PyUnicode_4BYTE_KIND:
2563
960k
        result = PyMem_New(Py_UCS4, len);
2564
960k
        if (!result)
2565
0
            return PyErr_NoMemory();
2566
960k
        if (skind == PyUnicode_2BYTE_KIND) {
2567
0
            _PyUnicode_CONVERT_BYTES(
2568
0
                Py_UCS2, Py_UCS4,
2569
0
                (const Py_UCS2 *)data,
2570
0
                ((const Py_UCS2 *)data) + len,
2571
0
                result);
2572
0
        }
2573
960k
        else {
2574
960k
            assert(skind == PyUnicode_1BYTE_KIND);
2575
960k
            _PyUnicode_CONVERT_BYTES(
2576
960k
                Py_UCS1, Py_UCS4,
2577
960k
                (const Py_UCS1 *)data,
2578
960k
                ((const Py_UCS1 *)data) + len,
2579
960k
                result);
2580
960k
        }
2581
960k
        return result;
2582
0
    default:
2583
0
        Py_UNREACHABLE();
2584
0
        return NULL;
2585
9.08M
    }
2586
9.08M
}
2587
2588
static Py_UCS4*
2589
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2590
        int copy_null)
2591
75.3k
{
2592
75.3k
    int kind;
2593
75.3k
    const void *data;
2594
75.3k
    Py_ssize_t len, targetlen;
2595
75.3k
    kind = PyUnicode_KIND(string);
2596
75.3k
    data = PyUnicode_DATA(string);
2597
75.3k
    len = PyUnicode_GET_LENGTH(string);
2598
75.3k
    targetlen = len;
2599
75.3k
    if (copy_null)
2600
0
        targetlen++;
2601
75.3k
    if (!target) {
2602
0
        target = PyMem_New(Py_UCS4, targetlen);
2603
0
        if (!target) {
2604
0
            PyErr_NoMemory();
2605
0
            return NULL;
2606
0
        }
2607
0
    }
2608
75.3k
    else {
2609
75.3k
        if (targetsize < targetlen) {
2610
0
            PyErr_Format(PyExc_SystemError,
2611
0
                         "string is longer than the buffer");
2612
0
            if (copy_null && 0 < targetsize)
2613
0
                target[0] = 0;
2614
0
            return NULL;
2615
0
        }
2616
75.3k
    }
2617
75.3k
    if (kind == PyUnicode_1BYTE_KIND) {
2618
54.3k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2619
54.3k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2620
54.3k
    }
2621
20.9k
    else if (kind == PyUnicode_2BYTE_KIND) {
2622
15.8k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2623
15.8k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2624
15.8k
    }
2625
5.14k
    else if (kind == PyUnicode_4BYTE_KIND) {
2626
5.14k
        memcpy(target, data, len * sizeof(Py_UCS4));
2627
5.14k
    }
2628
0
    else {
2629
0
        Py_UNREACHABLE();
2630
0
    }
2631
75.3k
    if (copy_null)
2632
0
        target[len] = 0;
2633
75.3k
    return target;
2634
75.3k
}
2635
2636
Py_UCS4*
2637
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2638
                 int copy_null)
2639
75.3k
{
2640
75.3k
    if (target == NULL || targetsize < 0) {
2641
0
        PyErr_BadInternalCall();
2642
0
        return NULL;
2643
0
    }
2644
75.3k
    return as_ucs4(string, target, targetsize, copy_null);
2645
75.3k
}
2646
2647
Py_UCS4*
2648
PyUnicode_AsUCS4Copy(PyObject *string)
2649
0
{
2650
0
    return as_ucs4(string, NULL, 0, 1);
2651
0
}
2652
2653
/* Size of the scratch buffer needed to format one integer (%jo, %jd, ...)
   or one pointer (%p).

   The widest rendering is octal, which needs at most
   ceil(8 * sizeof(intmax_t) / 3) digits.  On top of that come:
     - 1 character for the sign,
     - 2 characters for the "0x" prefix (used by %p),
     - 1 character for the terminating NUL. */
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2658
2659
static int
2660
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2661
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2662
15.8k
{
2663
15.8k
    Py_ssize_t length, fill, arglen;
2664
15.8k
    Py_UCS4 maxchar;
2665
2666
15.8k
    length = PyUnicode_GET_LENGTH(str);
2667
15.8k
    if ((precision == -1 || precision >= length)
2668
15.8k
        && width <= length)
2669
15.8k
        return _PyUnicodeWriter_WriteStr(writer, str);
2670
2671
65
    if (precision != -1)
2672
65
        length = Py_MIN(precision, length);
2673
2674
65
    arglen = Py_MAX(length, width);
2675
65
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2676
39
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2677
26
    else
2678
26
        maxchar = writer->maxchar;
2679
2680
65
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2681
0
        return -1;
2682
2683
65
    fill = Py_MAX(width - length, 0);
2684
65
    if (fill && !(flags & F_LJUST)) {
2685
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2686
0
            return -1;
2687
0
        writer->pos += fill;
2688
0
    }
2689
2690
65
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2691
65
                                  str, 0, length);
2692
65
    writer->pos += length;
2693
2694
65
    if (fill && (flags & F_LJUST)) {
2695
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2696
0
            return -1;
2697
0
        writer->pos += fill;
2698
0
    }
2699
2700
65
    return 0;
2701
65
}
2702
2703
static int
2704
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2705
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2706
5.39M
{
2707
    /* UTF-8 */
2708
5.39M
    Py_ssize_t *pconsumed = NULL;
2709
5.39M
    Py_ssize_t length;
2710
5.39M
    if (precision == -1) {
2711
218k
        length = strlen(str);
2712
218k
    }
2713
5.17M
    else {
2714
5.17M
        length = 0;
2715
21.5M
        while (length < precision && str[length]) {
2716
16.3M
            length++;
2717
16.3M
        }
2718
5.17M
        if (length == precision) {
2719
            /* The input string is not NUL-terminated.  If it ends with an
2720
             * incomplete UTF-8 sequence, truncate the string just before it.
2721
             * Incomplete sequences in the middle and sequences which cannot
2722
             * be valid prefixes are still treated as errors and replaced
2723
             * with \xfffd. */
2724
3.33k
            pconsumed = &length;
2725
3.33k
        }
2726
5.17M
    }
2727
2728
5.39M
    if (width < 0) {
2729
5.39M
        return unicode_decode_utf8_writer(writer, str, length,
2730
5.39M
                                          _Py_ERROR_REPLACE, "replace", pconsumed);
2731
5.39M
    }
2732
2733
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2734
0
                                                     "replace", pconsumed);
2735
0
    if (unicode == NULL)
2736
0
        return -1;
2737
2738
0
    int res = unicode_fromformat_write_str(writer, unicode,
2739
0
                                           width, -1, flags);
2740
0
    Py_DECREF(unicode);
2741
0
    return res;
2742
0
}
2743
2744
static int
2745
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2746
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2747
0
{
2748
0
    Py_ssize_t length;
2749
0
    if (precision == -1) {
2750
0
        length = wcslen(str);
2751
0
    }
2752
0
    else {
2753
0
        length = 0;
2754
0
        while (length < precision && str[length]) {
2755
0
            length++;
2756
0
        }
2757
0
    }
2758
2759
0
    if (width < 0) {
2760
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2761
0
                                             str, length);
2762
0
    }
2763
2764
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2765
0
    if (unicode == NULL)
2766
0
        return -1;
2767
2768
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2769
0
    Py_DECREF(unicode);
2770
0
    return res;
2771
0
}
2772
2773
0
/* Size modifiers recognised by unicode_fromformat_arg().  A value of 0 in
   `sizemod` means that no modifier was given. */
#define F_LONG     1   /* "l"  */
#define F_LONGLONG 2   /* "ll" */
#define F_SIZE     3   /* "z"  */
#define F_PTRDIFF  4   /* "t"  */
#define F_INTMAX   5   /* "j"  */
2778
2779
static const char*
2780
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2781
                       const char *f, va_list *vargs)
2782
36.0M
{
2783
36.0M
    const char *p;
2784
36.0M
    Py_ssize_t len;
2785
36.0M
    int flags = 0;
2786
36.0M
    Py_ssize_t width;
2787
36.0M
    Py_ssize_t precision;
2788
2789
36.0M
    p = f;
2790
36.0M
    f++;
2791
36.0M
    if (*f == '%') {
2792
5.16M
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2793
0
            return NULL;
2794
5.16M
        f++;
2795
5.16M
        return f;
2796
5.16M
    }
2797
2798
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2799
    /* Flags '+', ' ' and '#' are not particularly useful.
2800
     * They are not worth the implementation and maintenance costs.
2801
     * In addition, '#' should add "0" for "o" conversions for compatibility
2802
     * with printf, but it would confuse Python users. */
2803
30.8M
    while (1) {
2804
30.8M
        switch (*f++) {
2805
0
        case '-': flags |= F_LJUST; continue;
2806
2.70k
        case '0': flags |= F_ZERO; continue;
2807
0
        case '#': flags |= F_ALT; continue;
2808
30.8M
        }
2809
30.8M
        f--;
2810
30.8M
        break;
2811
30.8M
    }
2812
2813
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2814
30.8M
    width = -1;
2815
30.8M
    if (*f == '*') {
2816
0
        width = va_arg(*vargs, int);
2817
0
        if (width < 0) {
2818
0
            flags |= F_LJUST;
2819
0
            width = -width;
2820
0
        }
2821
0
        f++;
2822
0
    }
2823
30.8M
    else if (Py_ISDIGIT((unsigned)*f)) {
2824
2.70k
        width = *f - '0';
2825
2.70k
        f++;
2826
2.70k
        while (Py_ISDIGIT((unsigned)*f)) {
2827
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2828
0
                PyErr_SetString(PyExc_ValueError,
2829
0
                                "width too big");
2830
0
                return NULL;
2831
0
            }
2832
0
            width = (width * 10) + (*f - '0');
2833
0
            f++;
2834
0
        }
2835
2.70k
    }
2836
30.8M
    precision = -1;
2837
30.8M
    if (*f == '.') {
2838
5.18M
        f++;
2839
5.18M
        if (*f == '*') {
2840
0
            precision = va_arg(*vargs, int);
2841
0
            if (precision < 0) {
2842
0
                precision = -2;
2843
0
            }
2844
0
            f++;
2845
0
        }
2846
5.18M
        else if (Py_ISDIGIT((unsigned)*f)) {
2847
5.18M
            precision = (*f - '0');
2848
5.18M
            f++;
2849
15.5M
            while (Py_ISDIGIT((unsigned)*f)) {
2850
10.3M
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2851
0
                    PyErr_SetString(PyExc_ValueError,
2852
0
                                    "precision too big");
2853
0
                    return NULL;
2854
0
                }
2855
10.3M
                precision = (precision * 10) + (*f - '0');
2856
10.3M
                f++;
2857
10.3M
            }
2858
5.18M
        }
2859
5.18M
    }
2860
2861
30.8M
    int sizemod = 0;
2862
30.8M
    if (*f == 'l') {
2863
0
        if (f[1] == 'l') {
2864
0
            sizemod = F_LONGLONG;
2865
0
            f += 2;
2866
0
        }
2867
0
        else {
2868
0
            sizemod = F_LONG;
2869
0
            ++f;
2870
0
        }
2871
0
    }
2872
30.8M
    else if (*f == 'z') {
2873
42.9k
        sizemod = F_SIZE;
2874
42.9k
        ++f;
2875
42.9k
    }
2876
30.7M
    else if (*f == 't') {
2877
0
        sizemod = F_PTRDIFF;
2878
0
        ++f;
2879
0
    }
2880
30.7M
    else if (*f == 'j') {
2881
0
        sizemod = F_INTMAX;
2882
0
        ++f;
2883
0
    }
2884
30.8M
    if (f[0] != '\0' && f[1] == '\0')
2885
5.27M
        writer->overallocate = 0;
2886
2887
30.8M
    switch (*f) {
2888
20.2M
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2889
20.2M
        break;
2890
5.17M
    case 'c': case 'p':
2891
5.17M
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2892
5.17M
        break;
2893
5.39M
    case 's':
2894
5.39M
    case 'V':
2895
5.39M
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2896
5.39M
        break;
2897
5.39M
    default:
2898
15.8k
        if (sizemod) goto invalid_format;
2899
15.8k
        break;
2900
30.8M
    }
2901
2902
30.8M
    switch (*f) {
2903
5.17M
    case 'c':
2904
5.17M
    {
2905
5.17M
        int ordinal = va_arg(*vargs, int);
2906
5.17M
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2907
0
            PyErr_SetString(PyExc_OverflowError,
2908
0
                            "character argument not in range(0x110000)");
2909
0
            return NULL;
2910
0
        }
2911
5.17M
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2912
0
            return NULL;
2913
5.17M
        break;
2914
5.17M
    }
2915
2916
20.2M
    case 'd': case 'i':
2917
20.2M
    case 'o': case 'u': case 'x': case 'X':
2918
20.2M
    {
2919
20.2M
        char buffer[MAX_INTMAX_CHARS];
2920
2921
        // Fill buffer using sprinf, with one of many possible format
2922
        // strings, like "%llX" for `long long` in hexadecimal.
2923
        // The type/size is in `sizemod`; the format is in `*f`.
2924
2925
        // Use macros with nested switches to keep the sprintf format strings
2926
        // as compile-time literals, avoiding warnings and maybe allowing
2927
        // optimizations.
2928
2929
        // `SPRINT` macro does one sprintf
2930
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2931
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2932
20.2M
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2933
20.2M
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2934
2935
        // One inner switch to handle all format variants
2936
20.2M
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2937
20.2M
            switch (*f) {                                                     \
2938
0
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2939
0
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2940
1.60k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2941
1.10k
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2942
20.2M
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2943
20.2M
            }
2944
2945
        // Outer switch to handle all the sizes/types
2946
20.2M
        switch (sizemod) {
2947
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2948
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2949
42.9k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2950
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2951
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2952
20.2M
            default:         DO_SPRINTS("", int, unsigned int); break;
2953
20.2M
        }
2954
20.2M
        #undef SPRINT
2955
20.2M
        #undef DO_SPRINTS
2956
2957
20.2M
        assert(len >= 0);
2958
2959
20.2M
        int sign = (buffer[0] == '-');
2960
20.2M
        len -= sign;
2961
2962
20.2M
        precision = Py_MAX(precision, len);
2963
20.2M
        width = Py_MAX(width, precision + sign);
2964
20.2M
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2965
2.70k
            precision = width - sign;
2966
2.70k
        }
2967
2968
20.2M
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2969
20.2M
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2970
2971
20.2M
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2972
0
            return NULL;
2973
2974
20.2M
        if (spacepad && !(flags & F_LJUST)) {
2975
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2976
0
                return NULL;
2977
0
            writer->pos += spacepad;
2978
0
        }
2979
2980
20.2M
        if (sign) {
2981
0
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2982
0
                return NULL;
2983
0
        }
2984
2985
20.2M
        if (zeropad) {
2986
698
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2987
0
                return NULL;
2988
698
            writer->pos += zeropad;
2989
698
        }
2990
2991
20.2M
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2992
0
            return NULL;
2993
2994
20.2M
        if (spacepad && (flags & F_LJUST)) {
2995
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2996
0
                return NULL;
2997
0
            writer->pos += spacepad;
2998
0
        }
2999
20.2M
        break;
3000
20.2M
    }
3001
3002
20.2M
    case 'p':
3003
0
    {
3004
0
        char number[MAX_INTMAX_CHARS];
3005
3006
0
        len = sprintf(number, "%p", va_arg(*vargs, void*));
3007
0
        assert(len >= 0);
3008
3009
        /* %p is ill-defined:  ensure leading 0x. */
3010
0
        if (number[1] == 'X')
3011
0
            number[1] = 'x';
3012
0
        else if (number[1] != 'x') {
3013
0
            memmove(number + 2, number,
3014
0
                    strlen(number) + 1);
3015
0
            number[0] = '0';
3016
0
            number[1] = 'x';
3017
0
            len += 2;
3018
0
        }
3019
3020
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
3021
0
            return NULL;
3022
0
        break;
3023
0
    }
3024
3025
5.39M
    case 's':
3026
5.39M
    {
3027
5.39M
        if (sizemod) {
3028
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
3029
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
3030
0
                return NULL;
3031
0
        }
3032
5.39M
        else {
3033
            /* UTF-8 */
3034
5.39M
            const char *s = va_arg(*vargs, const char*);
3035
5.39M
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
3036
0
                return NULL;
3037
5.39M
        }
3038
5.39M
        break;
3039
5.39M
    }
3040
3041
5.39M
    case 'U':
3042
15.2k
    {
3043
15.2k
        PyObject *obj = va_arg(*vargs, PyObject *);
3044
15.2k
        assert(obj && _PyUnicode_CHECK(obj));
3045
3046
15.2k
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
3047
0
            return NULL;
3048
15.2k
        break;
3049
15.2k
    }
3050
3051
15.2k
    case 'V':
3052
0
    {
3053
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3054
0
        const char *str;
3055
0
        const wchar_t *wstr;
3056
0
        if (sizemod) {
3057
0
            wstr = va_arg(*vargs, const wchar_t*);
3058
0
        }
3059
0
        else {
3060
0
            str = va_arg(*vargs, const char *);
3061
0
        }
3062
0
        if (obj) {
3063
0
            assert(_PyUnicode_CHECK(obj));
3064
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
3065
0
                return NULL;
3066
0
        }
3067
0
        else if (sizemod) {
3068
0
            assert(wstr != NULL);
3069
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
3070
0
                return NULL;
3071
0
        }
3072
0
        else {
3073
0
            assert(str != NULL);
3074
0
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
3075
0
                return NULL;
3076
0
        }
3077
0
        break;
3078
0
    }
3079
3080
27
    case 'S':
3081
27
    {
3082
27
        PyObject *obj = va_arg(*vargs, PyObject *);
3083
27
        PyObject *str;
3084
27
        assert(obj);
3085
27
        str = PyObject_Str(obj);
3086
27
        if (!str)
3087
0
            return NULL;
3088
27
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
3089
0
            Py_DECREF(str);
3090
0
            return NULL;
3091
0
        }
3092
27
        Py_DECREF(str);
3093
27
        break;
3094
27
    }
3095
3096
644
    case 'R':
3097
644
    {
3098
644
        PyObject *obj = va_arg(*vargs, PyObject *);
3099
644
        PyObject *repr;
3100
644
        assert(obj);
3101
644
        repr = PyObject_Repr(obj);
3102
644
        if (!repr)
3103
0
            return NULL;
3104
644
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
3105
0
            Py_DECREF(repr);
3106
0
            return NULL;
3107
0
        }
3108
644
        Py_DECREF(repr);
3109
644
        break;
3110
644
    }
3111
3112
0
    case 'A':
3113
0
    {
3114
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3115
0
        PyObject *ascii;
3116
0
        assert(obj);
3117
0
        ascii = PyObject_ASCII(obj);
3118
0
        if (!ascii)
3119
0
            return NULL;
3120
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
3121
0
            Py_DECREF(ascii);
3122
0
            return NULL;
3123
0
        }
3124
0
        Py_DECREF(ascii);
3125
0
        break;
3126
0
    }
3127
3128
0
    case 'T':
3129
0
    {
3130
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3131
0
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
3132
3133
0
        PyObject *type_name;
3134
0
        if (flags & F_ALT) {
3135
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3136
0
        }
3137
0
        else {
3138
0
            type_name = PyType_GetFullyQualifiedName(type);
3139
0
        }
3140
0
        Py_DECREF(type);
3141
0
        if (!type_name) {
3142
0
            return NULL;
3143
0
        }
3144
3145
0
        if (unicode_fromformat_write_str(writer, type_name,
3146
0
                                         width, precision, flags) == -1) {
3147
0
            Py_DECREF(type_name);
3148
0
            return NULL;
3149
0
        }
3150
0
        Py_DECREF(type_name);
3151
0
        break;
3152
0
    }
3153
3154
0
    case 'N':
3155
0
    {
3156
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3157
0
        assert(type_raw != NULL);
3158
3159
0
        if (!PyType_Check(type_raw)) {
3160
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3161
0
            return NULL;
3162
0
        }
3163
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3164
3165
0
        PyObject *type_name;
3166
0
        if (flags & F_ALT) {
3167
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3168
0
        }
3169
0
        else {
3170
0
            type_name = PyType_GetFullyQualifiedName(type);
3171
0
        }
3172
0
        if (!type_name) {
3173
0
            return NULL;
3174
0
        }
3175
0
        if (unicode_fromformat_write_str(writer, type_name,
3176
0
                                         width, precision, flags) == -1) {
3177
0
            Py_DECREF(type_name);
3178
0
            return NULL;
3179
0
        }
3180
0
        Py_DECREF(type_name);
3181
0
        break;
3182
0
    }
3183
3184
0
    default:
3185
0
    invalid_format:
3186
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3187
0
        return NULL;
3188
30.8M
    }
3189
3190
30.8M
    f++;
3191
30.8M
    return f;
3192
30.8M
}
3193
3194
static int
3195
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3196
15.4M
{
3197
15.4M
    Py_ssize_t len = strlen(format);
3198
15.4M
    writer->min_length += len + 100;
3199
15.4M
    writer->overallocate = 1;
3200
3201
    // Copy varags to be able to pass a reference to a subfunction.
3202
15.4M
    va_list vargs2;
3203
15.4M
    va_copy(vargs2, vargs);
3204
3205
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3206
    // to be encoded to ASCII.
3207
15.4M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3208
15.4M
    if (!is_ascii) {
3209
0
        Py_ssize_t i;
3210
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3211
0
        PyErr_Format(PyExc_ValueError,
3212
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3213
0
            "string, got a non-ASCII byte: 0x%02x",
3214
0
            (unsigned char)format[i]);
3215
0
        goto fail;
3216
0
    }
3217
3218
87.3M
    for (const char *f = format; *f; ) {
3219
71.8M
        if (*f == '%') {
3220
36.0M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3221
36.0M
            if (f == NULL)
3222
0
                goto fail;
3223
36.0M
        }
3224
35.8M
        else {
3225
35.8M
            const char *p = strchr(f, '%');
3226
35.8M
            if (p != NULL) {
3227
25.6M
                len = p - f;
3228
25.6M
            }
3229
10.2M
            else {
3230
10.2M
                len = strlen(f);
3231
10.2M
                writer->overallocate = 0;
3232
10.2M
            }
3233
3234
35.8M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3235
0
                goto fail;
3236
0
            }
3237
35.8M
            f += len;
3238
35.8M
        }
3239
71.8M
    }
3240
15.4M
    va_end(vargs2);
3241
15.4M
    return 0;
3242
3243
0
  fail:
3244
0
    va_end(vargs2);
3245
0
    return -1;
3246
15.4M
}
3247
3248
PyObject *
3249
PyUnicode_FromFormatV(const char *format, va_list vargs)
3250
15.4M
{
3251
15.4M
    _PyUnicodeWriter writer;
3252
15.4M
    _PyUnicodeWriter_Init(&writer);
3253
3254
15.4M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3255
0
        _PyUnicodeWriter_Dealloc(&writer);
3256
0
        return NULL;
3257
0
    }
3258
15.4M
    return _PyUnicodeWriter_Finish(&writer);
3259
15.4M
}
3260
3261
/* Variadic front end of PyUnicode_FromFormatV(): collect the arguments
   following `format` and return whatever PyUnicode_FromFormatV() returns. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list vargs;
    va_start(vargs, format);
    PyObject *result = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);

    return result;
}
3272
3273
int
3274
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3275
0
{
3276
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3277
0
    Py_ssize_t old_pos = _writer->pos;
3278
3279
0
    va_list vargs;
3280
0
    va_start(vargs, format);
3281
0
    int res = unicode_from_format(_writer, format, vargs);
3282
0
    va_end(vargs);
3283
3284
0
    if (res < 0) {
3285
0
        _writer->pos = old_pos;
3286
0
    }
3287
0
    return res;
3288
0
}
3289
3290
static Py_ssize_t
3291
unicode_get_widechar_size(PyObject *unicode)
3292
1.72k
{
3293
1.72k
    Py_ssize_t res;
3294
3295
1.72k
    assert(unicode != NULL);
3296
1.72k
    assert(_PyUnicode_CHECK(unicode));
3297
3298
1.72k
    res = _PyUnicode_LENGTH(unicode);
3299
#if SIZEOF_WCHAR_T == 2
3300
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3301
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3302
        const Py_UCS4 *end = s + res;
3303
        for (; s < end; ++s) {
3304
            if (*s > 0xFFFF) {
3305
                ++res;
3306
            }
3307
        }
3308
    }
3309
#endif
3310
1.72k
    return res;
3311
1.72k
}
3312
3313
static void
3314
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3315
1.72k
{
3316
1.72k
    assert(unicode != NULL);
3317
1.72k
    assert(_PyUnicode_CHECK(unicode));
3318
3319
1.72k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3320
0
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3321
0
        return;
3322
0
    }
3323
3324
1.72k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3325
1.72k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3326
109k
        for (; size--; ++s, ++w) {
3327
107k
            *w = *s;
3328
107k
        }
3329
1.72k
    }
3330
0
    else {
3331
0
#if SIZEOF_WCHAR_T == 4
3332
0
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3333
0
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3334
0
        for (; size--; ++s, ++w) {
3335
0
            *w = *s;
3336
0
        }
3337
#else
3338
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3339
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3340
        for (; size--; ++s, ++w) {
3341
            Py_UCS4 ch = *s;
3342
            if (ch > 0xFFFF) {
3343
                assert(ch <= MAX_UNICODE);
3344
                /* encode surrogate pair in this case */
3345
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3346
                if (!size--)
3347
                    break;
3348
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3349
            }
3350
            else {
3351
                *w = ch;
3352
            }
3353
        }
3354
#endif
3355
0
    }
3356
1.72k
}
3357
3358
#ifdef HAVE_WCHAR_H
3359
3360
/* Convert a Unicode object to a wide character string.
3361
3362
   - If w is NULL: return the number of wide characters (including the null
3363
     character) required to convert the unicode object. Ignore size argument.
3364
3365
   - Otherwise: return the number of wide characters (excluding the null
3366
     character) written into w. Write at most size wide characters (including
3367
     the null character). */
3368
Py_ssize_t
3369
PyUnicode_AsWideChar(PyObject *unicode,
3370
                     wchar_t *w,
3371
                     Py_ssize_t size)
3372
461
{
3373
461
    Py_ssize_t res;
3374
3375
461
    if (unicode == NULL) {
3376
0
        PyErr_BadInternalCall();
3377
0
        return -1;
3378
0
    }
3379
461
    if (!PyUnicode_Check(unicode)) {
3380
0
        PyErr_BadArgument();
3381
0
        return -1;
3382
0
    }
3383
3384
461
    res = unicode_get_widechar_size(unicode);
3385
461
    if (w == NULL) {
3386
0
        return res + 1;
3387
0
    }
3388
3389
461
    if (size > res) {
3390
461
        size = res + 1;
3391
461
    }
3392
0
    else {
3393
0
        res = size;
3394
0
    }
3395
461
    unicode_copy_as_widechar(unicode, w, size);
3396
3397
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3398
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3399
       non-Unicode locales and hence needs conversion first. */
3400
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3401
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3402
            return -1;
3403
        }
3404
    }
3405
#endif
3406
3407
461
    return res;
3408
461
}
3409
3410
wchar_t*
3411
PyUnicode_AsWideCharString(PyObject *unicode,
3412
                           Py_ssize_t *size)
3413
1.26k
{
3414
1.26k
    wchar_t *buffer;
3415
1.26k
    Py_ssize_t buflen;
3416
3417
1.26k
    if (unicode == NULL) {
3418
0
        PyErr_BadInternalCall();
3419
0
        return NULL;
3420
0
    }
3421
1.26k
    if (!PyUnicode_Check(unicode)) {
3422
0
        PyErr_BadArgument();
3423
0
        return NULL;
3424
0
    }
3425
3426
1.26k
    buflen = unicode_get_widechar_size(unicode);
3427
1.26k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3428
1.26k
    if (buffer == NULL) {
3429
0
        PyErr_NoMemory();
3430
0
        return NULL;
3431
0
    }
3432
1.26k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3433
3434
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3435
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3436
       non-Unicode locales and hence needs conversion first. */
3437
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3438
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3439
            return NULL;
3440
        }
3441
    }
3442
#endif
3443
3444
1.26k
    if (size != NULL) {
3445
814
        *size = buflen;
3446
814
    }
3447
448
    else if (wcslen(buffer) != (size_t)buflen) {
3448
0
        PyMem_Free(buffer);
3449
0
        PyErr_SetString(PyExc_ValueError,
3450
0
                        "embedded null character");
3451
0
        return NULL;
3452
0
    }
3453
1.26k
    return buffer;
3454
1.26k
}
3455
3456
#endif /* HAVE_WCHAR_H */
3457
3458
int
3459
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3460
0
{
3461
0
    wchar_t **p = (wchar_t **)ptr;
3462
0
    if (obj == NULL) {
3463
0
        PyMem_Free(*p);
3464
0
        *p = NULL;
3465
0
        return 1;
3466
0
    }
3467
0
    if (PyUnicode_Check(obj)) {
3468
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3469
0
        if (*p == NULL) {
3470
0
            return 0;
3471
0
        }
3472
0
        return Py_CLEANUP_SUPPORTED;
3473
0
    }
3474
0
    PyErr_Format(PyExc_TypeError,
3475
0
                 "argument must be str, not %.50s",
3476
0
                 Py_TYPE(obj)->tp_name);
3477
0
    return 0;
3478
0
}
3479
3480
int
3481
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3482
0
{
3483
0
    wchar_t **p = (wchar_t **)ptr;
3484
0
    if (obj == NULL) {
3485
0
        PyMem_Free(*p);
3486
0
        *p = NULL;
3487
0
        return 1;
3488
0
    }
3489
0
    if (obj == Py_None) {
3490
0
        *p = NULL;
3491
0
        return 1;
3492
0
    }
3493
0
    if (PyUnicode_Check(obj)) {
3494
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3495
0
        if (*p == NULL) {
3496
0
            return 0;
3497
0
        }
3498
0
        return Py_CLEANUP_SUPPORTED;
3499
0
    }
3500
0
    PyErr_Format(PyExc_TypeError,
3501
0
                 "argument must be str or None, not %.50s",
3502
0
                 Py_TYPE(obj)->tp_name);
3503
0
    return 0;
3504
0
}
3505
3506
PyObject *
3507
PyUnicode_FromOrdinal(int ordinal)
3508
240k
{
3509
240k
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3510
0
        PyErr_SetString(PyExc_ValueError,
3511
0
                        "chr() arg not in range(0x110000)");
3512
0
        return NULL;
3513
0
    }
3514
3515
240k
    return unicode_char((Py_UCS4)ordinal);
3516
240k
}
3517
3518
/* Return a str for `obj`.

   - exact str: a new reference to `obj` itself;
   - str subtype: a copy made by _PyUnicode_Copy();
   - anything else: TypeError, returns NULL.

   XXX Perhaps this API should be an alias of PyObject_Str() instead ?! */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    /* For a Unicode subtype that's not a Unicode object,
       return a true Unicode object with the same data. */
    return _PyUnicode_Copy(obj);
}
3536
3537
/* Decode a bytes or bytes-like object into a str.

   - NULL `obj`: bad internal call.
   - bytes: decoded directly from its internal buffer.
   - str: TypeError ("decoding str is not supported").
   - other objects: a simple buffer view is requested; failure to obtain
     one is reported as TypeError.

   Empty input yields the empty string, but only after the encoding and
   errors arguments passed unicode_check_encoding_errors().
   Returns NULL with an exception set on failure. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        /* The view is released before the arguments are validated. */
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3589
3590
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters is collapsed to a single '_', except that a leading run
   produces nothing and a trailing run is dropped.

   `lower` receives the NUL-terminated result and has room for `lower_len`
   chars.  Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last slot of the output: reserved for the terminating NUL. */
    char *const out_limit = &lower[lower_len - 1];
    int pending_separator = 0;

    for (const char *in = encoding; *in != 0; in++) {
        const char ch = *in;

        if (!(Py_ISALNUM(ch) || ch == '.')) {
            /* Remember the punctuation; it is only emitted (as '_') if a
               regular character follows. */
            pending_separator = 1;
            continue;
        }

        if (pending_separator && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_separator = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(ch);
    }

    *out = '\0';
    return 1;
}
3638
3639
PyObject *
3640
PyUnicode_Decode(const char *s,
3641
                 Py_ssize_t size,
3642
                 const char *encoding,
3643
                 const char *errors)
3644
4.56M
{
3645
4.56M
    PyObject *buffer = NULL, *unicode;
3646
4.56M
    Py_buffer info;
3647
4.56M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3648
3649
4.56M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3650
0
        return NULL;
3651
0
    }
3652
3653
4.56M
    if (size == 0) {
3654
0
        _Py_RETURN_UNICODE_EMPTY();
3655
0
    }
3656
3657
4.56M
    if (encoding == NULL) {
3658
34.6k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3659
34.6k
    }
3660
3661
    /* Shortcuts for common default encodings */
3662
4.52M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3663
4.51M
        char *lower = buflower;
3664
3665
        /* Fast paths */
3666
4.51M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3667
799k
            lower += 3;
3668
799k
            if (*lower == '_') {
3669
                /* Match "utf8" and "utf_8" */
3670
799k
                lower++;
3671
799k
            }
3672
3673
799k
            if (lower[0] == '8' && lower[1] == 0) {
3674
799k
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3675
799k
            }
3676
752
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3677
130
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3678
130
            }
3679
622
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3680
63
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3681
63
            }
3682
799k
        }
3683
3.71M
        else {
3684
3.71M
            if (strcmp(lower, "ascii") == 0
3685
3.71M
                || strcmp(lower, "us_ascii") == 0) {
3686
835k
                return PyUnicode_DecodeASCII(s, size, errors);
3687
835k
            }
3688
    #ifdef MS_WINDOWS
3689
            else if (strcmp(lower, "mbcs") == 0) {
3690
                return PyUnicode_DecodeMBCS(s, size, errors);
3691
            }
3692
    #endif
3693
2.88M
            else if (strcmp(lower, "latin1") == 0
3694
2.88M
                     || strcmp(lower, "latin_1") == 0
3695
2.88M
                     || strcmp(lower, "iso_8859_1") == 0
3696
2.88M
                     || strcmp(lower, "iso8859_1") == 0) {
3697
2.55M
                return PyUnicode_DecodeLatin1(s, size, errors);
3698
2.55M
            }
3699
3.71M
        }
3700
4.51M
    }
3701
3702
    /* Decode via the codec registry */
3703
343k
    buffer = NULL;
3704
343k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3705
0
        goto onError;
3706
343k
    buffer = PyMemoryView_FromBuffer(&info);
3707
343k
    if (buffer == NULL)
3708
0
        goto onError;
3709
343k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3710
343k
    if (unicode == NULL)
3711
145k
        goto onError;
3712
197k
    if (!PyUnicode_Check(unicode)) {
3713
0
        PyErr_Format(PyExc_TypeError,
3714
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3715
0
                     "use codecs.decode() to decode to arbitrary types",
3716
0
                     encoding,
3717
0
                     Py_TYPE(unicode)->tp_name);
3718
0
        Py_DECREF(unicode);
3719
0
        goto onError;
3720
0
    }
3721
197k
    Py_DECREF(buffer);
3722
197k
    return unicode_result(unicode);
3723
3724
145k
  onError:
3725
145k
    Py_XDECREF(buffer);
3726
145k
    return NULL;
3727
197k
}
3728
3729
/* Decode a str object through the codec registry and return whatever
   object the codec produces.

   A NULL `encoding` selects PyUnicode_GetDefaultEncoding().  Returns NULL
   with an exception set if `unicode` is not a str or the codec fails. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec_name, errors);
}
3745
3746
/* Decode a str object through the codec registry, requiring a str result.

   A NULL `encoding` selects PyUnicode_GetDefaultEncoding().

   Returns a new str on success.  Returns NULL with an exception set when
   `unicode` is not a str, when the codec fails, or when the codec returns
   something that is not a str (TypeError naming the codec and the type it
   actually returned). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        goto onError;
    if (!PyUnicode_Check(v)) {
        /* Report the type of the decoder's result (v), not the type of the
           input, which is always str at this point. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        goto onError;
    }
    return unicode_result(v);

  onError:
    return NULL;
}
3779
3780
/* Encode a str object through the codec registry and return whatever
   object the codec produces.

   A NULL `encoding` selects PyUnicode_GetDefaultEncoding().  Returns NULL
   with an exception set if `unicode` is not a str or the codec fails. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry; a NULL result is passed through
       unchanged to signal the error. */
    return PyCodec_Encode(unicode, codec_name, errors);
}
3804
3805
3806
static PyObject *
3807
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3808
                      int current_locale)
3809
414
{
3810
414
    Py_ssize_t wlen;
3811
414
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3812
414
    if (wstr == NULL) {
3813
0
        return NULL;
3814
0
    }
3815
3816
414
    if ((size_t)wlen != wcslen(wstr)) {
3817
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3818
0
        PyMem_Free(wstr);
3819
0
        return NULL;
3820
0
    }
3821
3822
414
    char *str;
3823
414
    size_t error_pos;
3824
414
    const char *reason;
3825
414
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3826
414
                                 current_locale, error_handler);
3827
414
    PyMem_Free(wstr);
3828
3829
414
    if (res != 0) {
3830
0
        if (res == -2) {
3831
0
            PyObject *exc;
3832
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3833
0
                    "locale", unicode,
3834
0
                    (Py_ssize_t)error_pos,
3835
0
                    (Py_ssize_t)(error_pos+1),
3836
0
                    reason);
3837
0
            if (exc != NULL) {
3838
0
                PyCodec_StrictErrors(exc);
3839
0
                Py_DECREF(exc);
3840
0
            }
3841
0
        }
3842
0
        else if (res == -3) {
3843
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3844
0
        }
3845
0
        else {
3846
0
            PyErr_NoMemory();
3847
0
        }
3848
0
        return NULL;
3849
0
    }
3850
3851
414
    PyObject *bytes = PyBytes_FromString(str);
3852
414
    PyMem_RawFree(str);
3853
414
    return bytes;
3854
414
}
3855
3856
/* Encode a str with unicode_encode_locale(), passing current_locale=1.

   `errors` is translated to an error handler by _Py_GetErrorHandler().
   Returns a new bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    const _Py_error_handler handler = _Py_GetErrorHandler(errors);

    return unicode_encode_locale(unicode, handler, 1);
}
3862
3863
/* Encode a str using the interpreter's filesystem codec state.

   Order of preference:
   1. the UTF-8 fast path, when fs_codec->utf8 is set;
   2. the configured fs_codec->encoding via PyUnicode_AsEncodedString()
      (compiled out when _Py_FORCE_UTF8_FS_ENCODING is defined);
   3. otherwise a fallback driven by config->filesystem_errors: UTF-8 when
      _Py_FORCE_UTF8_FS_ENCODING is defined, else unicode_encode_locale()
      with current_locale=0.

   Returns a new bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }

#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *fs_errors = config->filesystem_errors;
    assert(fs_errors != NULL);
    _Py_error_handler handler = get_error_handler_wide(fs_errors);
    assert(handler != _Py_ERROR_UNKNOWN);

#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3896
3897
/* Encode a str object to bytes using `encoding` and `errors`.

   A NULL encoding means UTF-8.  A few common encodings (UTF-8/16/32,
   ASCII, Latin-1, and "mbcs" on Windows) are dispatched directly to their
   C encoders after the name has been normalized; everything else goes
   through the codec registry.

   A codec returning a bytearray triggers a RuntimeWarning and the result
   is converted to bytes; any other non-bytes result raises TypeError.

   Returns a new bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* sizeof("iso_8859_1") == 11: room for the longest shortcut name plus
       its terminating NUL.  Longer names fail normalization and fall
       through to the codec registry. */
    char buflower[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }

    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings */
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
        const char *name = buflower;

        /* Fast paths */
        if (name[0] == 'u' && name[1] == 't' && name[2] == 'f') {
            name += 3;
            if (*name == '_') {
                /* Match "utf8" and "utf_8" */
                name++;
            }

            if (strcmp(name, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(name, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(name, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else if (strcmp(name, "ascii") == 0
                 || strcmp(name, "us_ascii") == 0) {
            return _PyUnicode_AsASCIIString(unicode, errors);
        }
#ifdef MS_WINDOWS
        else if (strcmp(name, "mbcs") == 0) {
            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
        }
#endif
        else if (strcmp(name, "latin1") == 0 ||
                 strcmp(name, "latin_1") == 0 ||
                 strcmp(name, "iso_8859_1") == 0 ||
                 strcmp(name, "iso8859_1") == 0) {
            return _PyUnicode_AsLatin1String(unicode, errors);
        }
    }

    /* Encode via the codec registry */
    PyObject *encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal path */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    /* If the codec returns a buffer, raise a warning and convert to bytes */
    if (PyByteArray_Check(encoded)) {
        int warn_failed = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding);
        if (warn_failed) {
            Py_DECREF(encoded);
            return NULL;
        }

        PyObject *as_bytes = PyBytes_FromStringAndSize(
                                 PyByteArray_AS_STRING(encoded),
                                 PyByteArray_GET_SIZE(encoded));
        Py_DECREF(encoded);
        return as_bytes;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
3996
3997
/* Encode a str object through the codec registry, requiring a str result.

   A NULL `encoding` selects PyUnicode_GetDefaultEncoding().

   Returns the codec's result on success.  Returns NULL with an exception
   set when `unicode` is not a str, when the codec fails, or when the codec
   returns something that is not a str (TypeError). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    PyObject *encoded = PyCodec_Encode(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    if (PyUnicode_Check(encoded)) {
        return encoded;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
4030
4031
static PyObject*
4032
unicode_decode_locale(const char *str, Py_ssize_t len,
4033
                      _Py_error_handler errors, int current_locale)
4034
15.9k
{
4035
15.9k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
4036
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4037
0
        return NULL;
4038
0
    }
4039
4040
15.9k
    wchar_t *wstr;
4041
15.9k
    size_t wlen;
4042
15.9k
    const char *reason;
4043
15.9k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
4044
15.9k
                                 current_locale, errors);
4045
15.9k
    if (res != 0) {
4046
0
        if (res == -2) {
4047
0
            PyObject *exc;
4048
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4049
0
                                        "locale", str, len,
4050
0
                                        (Py_ssize_t)wlen,
4051
0
                                        (Py_ssize_t)(wlen + 1),
4052
0
                                        reason);
4053
0
            if (exc != NULL) {
4054
0
                PyCodec_StrictErrors(exc);
4055
0
                Py_DECREF(exc);
4056
0
            }
4057
0
        }
4058
0
        else if (res == -3) {
4059
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4060
0
        }
4061
0
        else {
4062
0
            PyErr_NoMemory();
4063
0
        }
4064
0
        return NULL;
4065
0
    }
4066
4067
15.9k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4068
15.9k
    PyMem_RawFree(wstr);
4069
15.9k
    return unicode;
4070
15.9k
}
4071
4072
PyObject*
4073
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4074
                              const char *errors)
4075
0
{
4076
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4077
0
    return unicode_decode_locale(str, len, error_handler, 1);
4078
0
}
4079
4080
PyObject*
4081
PyUnicode_DecodeLocale(const char *str, const char *errors)
4082
10.8k
{
4083
10.8k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
4084
10.8k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4085
10.8k
    return unicode_decode_locale(str, size, error_handler, 1);
4086
10.8k
}
4087
4088
4089
PyObject*
4090
0
PyUnicode_DecodeFSDefault(const char *s) {
4091
0
    Py_ssize_t size = (Py_ssize_t)strlen(s);
4092
0
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
4093
0
}
4094
4095
PyObject*
4096
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4097
6.72k
{
4098
6.72k
    PyInterpreterState *interp = _PyInterpreterState_GET();
4099
6.72k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4100
6.72k
    if (fs_codec->utf8) {
4101
1.60k
        return unicode_decode_utf8(s, size,
4102
1.60k
                                   fs_codec->error_handler,
4103
1.60k
                                   fs_codec->errors,
4104
1.60k
                                   NULL);
4105
1.60k
    }
4106
5.12k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
4107
5.12k
    else if (fs_codec->encoding) {
4108
0
        return PyUnicode_Decode(s, size,
4109
0
                                fs_codec->encoding,
4110
0
                                fs_codec->errors);
4111
0
    }
4112
5.12k
#endif
4113
5.12k
    else {
4114
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
4115
           machinery is not ready and so cannot be used:
4116
           use mbstowcs() in this case. */
4117
5.12k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4118
5.12k
        const wchar_t *filesystem_errors = config->filesystem_errors;
4119
5.12k
        assert(filesystem_errors != NULL);
4120
5.12k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4121
5.12k
        assert(errors != _Py_ERROR_UNKNOWN);
4122
#ifdef _Py_FORCE_UTF8_FS_ENCODING
4123
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
4124
#else
4125
5.12k
        return unicode_decode_locale(s, size, errors, 0);
4126
5.12k
#endif
4127
5.12k
    }
4128
6.72k
}
4129
4130
4131
int
4132
PyUnicode_FSConverter(PyObject* arg, void* addr)
4133
11.4k
{
4134
11.4k
    PyObject *path = NULL;
4135
11.4k
    PyObject *output = NULL;
4136
11.4k
    Py_ssize_t size;
4137
11.4k
    const char *data;
4138
11.4k
    if (arg == NULL) {
4139
0
        Py_DECREF(*(PyObject**)addr);
4140
0
        *(PyObject**)addr = NULL;
4141
0
        return 1;
4142
0
    }
4143
11.4k
    path = PyOS_FSPath(arg);
4144
11.4k
    if (path == NULL) {
4145
0
        return 0;
4146
0
    }
4147
11.4k
    if (PyBytes_Check(path)) {
4148
0
        output = path;
4149
0
    }
4150
11.4k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4151
11.4k
        output = PyUnicode_EncodeFSDefault(path);
4152
11.4k
        Py_DECREF(path);
4153
11.4k
        if (!output) {
4154
0
            return 0;
4155
0
        }
4156
11.4k
        assert(PyBytes_Check(output));
4157
11.4k
    }
4158
4159
11.4k
    size = PyBytes_GET_SIZE(output);
4160
11.4k
    data = PyBytes_AS_STRING(output);
4161
11.4k
    if ((size_t)size != strlen(data)) {
4162
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4163
0
        Py_DECREF(output);
4164
0
        return 0;
4165
0
    }
4166
11.4k
    *(PyObject**)addr = output;
4167
11.4k
    return Py_CLEANUP_SUPPORTED;
4168
11.4k
}
4169
4170
4171
int
4172
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4173
20.7k
{
4174
20.7k
    if (arg == NULL) {
4175
0
        Py_DECREF(*(PyObject**)addr);
4176
0
        *(PyObject**)addr = NULL;
4177
0
        return 1;
4178
0
    }
4179
4180
20.7k
    PyObject *path = PyOS_FSPath(arg);
4181
20.7k
    if (path == NULL) {
4182
0
        return 0;
4183
0
    }
4184
4185
20.7k
    PyObject *output = NULL;
4186
20.7k
    if (PyUnicode_Check(path)) {
4187
20.7k
        output = path;
4188
20.7k
    }
4189
0
    else if (PyBytes_Check(path)) {
4190
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4191
0
                                                  PyBytes_GET_SIZE(path));
4192
0
        Py_DECREF(path);
4193
0
        if (!output) {
4194
0
            return 0;
4195
0
        }
4196
0
    }
4197
0
    else {
4198
0
        PyErr_Format(PyExc_TypeError,
4199
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4200
0
                     Py_TYPE(arg)->tp_name);
4201
0
        Py_DECREF(path);
4202
0
        return 0;
4203
0
    }
4204
4205
20.7k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4206
20.7k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4207
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4208
0
        Py_DECREF(output);
4209
0
        return 0;
4210
0
    }
4211
20.7k
    *(PyObject**)addr = output;
4212
20.7k
    return Py_CLEANUP_SUPPORTED;
4213
20.7k
}
4214
4215
4216
static int unicode_fill_utf8(PyObject *unicode);
4217
4218
4219
static int
4220
unicode_ensure_utf8(PyObject *unicode)
4221
18.6M
{
4222
18.6M
    int err = 0;
4223
18.6M
    if (PyUnicode_UTF8(unicode) == NULL) {
4224
162k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4225
162k
        if (PyUnicode_UTF8(unicode) == NULL) {
4226
162k
            err = unicode_fill_utf8(unicode);
4227
162k
        }
4228
162k
        Py_END_CRITICAL_SECTION();
4229
162k
    }
4230
18.6M
    return err;
4231
18.6M
}
4232
4233
const char *
4234
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4235
18.6M
{
4236
18.6M
    if (!PyUnicode_Check(unicode)) {
4237
0
        PyErr_BadArgument();
4238
0
        if (psize) {
4239
0
            *psize = -1;
4240
0
        }
4241
0
        return NULL;
4242
0
    }
4243
4244
18.6M
    if (unicode_ensure_utf8(unicode) == -1) {
4245
274
        if (psize) {
4246
274
            *psize = -1;
4247
274
        }
4248
274
        return NULL;
4249
274
    }
4250
4251
18.6M
    if (psize) {
4252
18.6M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4253
18.6M
    }
4254
18.6M
    return PyUnicode_UTF8(unicode);
4255
18.6M
}
4256
4257
const char *
4258
PyUnicode_AsUTF8(PyObject *unicode)
4259
61.5k
{
4260
61.5k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4261
61.5k
}
4262
4263
const char *
4264
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4265
1.07M
{
4266
1.07M
    Py_ssize_t size;
4267
1.07M
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4268
1.07M
    if (s && strlen(s) != (size_t)size) {
4269
160
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4270
160
        return NULL;
4271
160
    }
4272
1.07M
    return s;
4273
1.07M
}
4274
4275
/*
4276
PyUnicode_GetSize() has been deprecated since Python 3.3
4277
because it returned length of Py_UNICODE.
4278
4279
But this function is part of stable abi, because it doesn't
4280
include Py_UNICODE in signature and it was not excluded from
4281
stable ABI in PEP 384.
4282
*/
4283
PyAPI_FUNC(Py_ssize_t)
4284
PyUnicode_GetSize(PyObject *unicode)
4285
0
{
4286
0
    PyErr_SetString(PyExc_RuntimeError,
4287
0
                    "PyUnicode_GetSize has been removed.");
4288
0
    return -1;
4289
0
}
4290
4291
Py_ssize_t
4292
PyUnicode_GetLength(PyObject *unicode)
4293
32.5k
{
4294
32.5k
    if (!PyUnicode_Check(unicode)) {
4295
0
        PyErr_BadArgument();
4296
0
        return -1;
4297
0
    }
4298
32.5k
    return PyUnicode_GET_LENGTH(unicode);
4299
32.5k
}
4300
4301
Py_UCS4
4302
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4303
30
{
4304
30
    const void *data;
4305
30
    int kind;
4306
4307
30
    if (!PyUnicode_Check(unicode)) {
4308
0
        PyErr_BadArgument();
4309
0
        return (Py_UCS4)-1;
4310
0
    }
4311
30
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4312
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4313
0
        return (Py_UCS4)-1;
4314
0
    }
4315
30
    data = PyUnicode_DATA(unicode);
4316
30
    kind = PyUnicode_KIND(unicode);
4317
30
    return PyUnicode_READ(kind, data, index);
4318
30
}
4319
4320
int
4321
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4322
0
{
4323
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4324
0
        PyErr_BadArgument();
4325
0
        return -1;
4326
0
    }
4327
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4328
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4329
0
        return -1;
4330
0
    }
4331
0
    if (unicode_check_modifiable(unicode))
4332
0
        return -1;
4333
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4334
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4335
0
        return -1;
4336
0
    }
4337
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4338
0
                    index, ch);
4339
0
    return 0;
4340
0
}
4341
4342
/* Return the name of the default encoding: always the constant string
   "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4347
4348
/* create or adjust a UnicodeDecodeError */
4349
static void
4350
make_decode_exception(PyObject **exceptionObject,
4351
                      const char *encoding,
4352
                      const char *input, Py_ssize_t length,
4353
                      Py_ssize_t startpos, Py_ssize_t endpos,
4354
                      const char *reason)
4355
268k
{
4356
268k
    if (*exceptionObject == NULL) {
4357
76.8k
        *exceptionObject = PyUnicodeDecodeError_Create(
4358
76.8k
            encoding, input, length, startpos, endpos, reason);
4359
76.8k
    }
4360
191k
    else {
4361
191k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4362
0
            goto onError;
4363
191k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4364
0
            goto onError;
4365
191k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4366
0
            goto onError;
4367
191k
    }
4368
268k
    return;
4369
4370
268k
onError:
4371
0
    Py_CLEAR(*exceptionObject);
4372
0
}
4373
4374
#ifdef MS_WINDOWS
4375
static int
4376
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4377
{
4378
    if (newsize > *size) {
4379
        wchar_t *newbuf = *buf;
4380
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4381
            PyErr_NoMemory();
4382
            return -1;
4383
        }
4384
        *buf = newbuf;
4385
    }
4386
    *size = newsize;
4387
    return 0;
4388
}
4389
4390
/* error handling callback helper:
4391
   build arguments, call the callback and check the arguments,
4392
   if no exception occurred, copy the replacement to the output
4393
   and adjust various state variables.
4394
   return 0 on success, -1 on error
4395
*/
4396
4397
static int
4398
unicode_decode_call_errorhandler_wchar(
4399
    const char *errors, PyObject **errorHandler,
4400
    const char *encoding, const char *reason,
4401
    const char **input, const char **inend, Py_ssize_t *startinpos,
4402
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4403
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4404
{
4405
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4406
4407
    PyObject *restuple = NULL;
4408
    PyObject *repunicode = NULL;
4409
    Py_ssize_t outsize;
4410
    Py_ssize_t insize;
4411
    Py_ssize_t requiredsize;
4412
    Py_ssize_t newpos;
4413
    PyObject *inputobj = NULL;
4414
    Py_ssize_t repwlen;
4415
4416
    if (*errorHandler == NULL) {
4417
        *errorHandler = PyCodec_LookupError(errors);
4418
        if (*errorHandler == NULL)
4419
            goto onError;
4420
    }
4421
4422
    make_decode_exception(exceptionObject,
4423
        encoding,
4424
        *input, *inend - *input,
4425
        *startinpos, *endinpos,
4426
        reason);
4427
    if (*exceptionObject == NULL)
4428
        goto onError;
4429
4430
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4431
    if (restuple == NULL)
4432
        goto onError;
4433
    if (!PyTuple_Check(restuple)) {
4434
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4435
        goto onError;
4436
    }
4437
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4438
        goto onError;
4439
4440
    /* Copy back the bytes variables, which might have been modified by the
4441
       callback */
4442
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4443
    if (!inputobj)
4444
        goto onError;
4445
    *input = PyBytes_AS_STRING(inputobj);
4446
    insize = PyBytes_GET_SIZE(inputobj);
4447
    *inend = *input + insize;
4448
    /* we can DECREF safely, as the exception has another reference,
4449
       so the object won't go away. */
4450
    Py_DECREF(inputobj);
4451
4452
    if (newpos<0)
4453
        newpos = insize+newpos;
4454
    if (newpos<0 || newpos>insize) {
4455
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4456
        goto onError;
4457
    }
4458
4459
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4460
    if (repwlen < 0)
4461
        goto onError;
4462
    repwlen--;
4463
    /* need more space? (at least enough for what we
4464
       have+the replacement+the rest of the string (starting
4465
       at the new input position), so we won't have to check space
4466
       when there are no errors in the rest of the string) */
4467
    requiredsize = *outpos;
4468
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4469
        goto overflow;
4470
    requiredsize += repwlen;
4471
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4472
        goto overflow;
4473
    requiredsize += insize - newpos;
4474
    outsize = *bufsize;
4475
    if (requiredsize > outsize) {
4476
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4477
            requiredsize = 2*outsize;
4478
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4479
            goto onError;
4480
        }
4481
    }
4482
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4483
    *outpos += repwlen;
4484
    *endinpos = newpos;
4485
    *inptr = *input + newpos;
4486
4487
    /* we made it! */
4488
    Py_DECREF(restuple);
4489
    return 0;
4490
4491
  overflow:
4492
    PyErr_SetString(PyExc_OverflowError,
4493
                    "decoded result is too long for a Python string");
4494
4495
  onError:
4496
    Py_XDECREF(restuple);
4497
    return -1;
4498
}
4499
#endif   /* MS_WINDOWS */
4500
4501
static int
4502
unicode_decode_call_errorhandler_writer(
4503
    const char *errors, PyObject **errorHandler,
4504
    const char *encoding, const char *reason,
4505
    const char **input, const char **inend, Py_ssize_t *startinpos,
4506
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4507
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4508
268k
{
4509
268k
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4510
4511
268k
    PyObject *restuple = NULL;
4512
268k
    PyObject *repunicode = NULL;
4513
268k
    Py_ssize_t insize;
4514
268k
    Py_ssize_t newpos;
4515
268k
    Py_ssize_t replen;
4516
268k
    Py_ssize_t remain;
4517
268k
    PyObject *inputobj = NULL;
4518
268k
    int need_to_grow = 0;
4519
268k
    const char *new_inptr;
4520
4521
268k
    if (*errorHandler == NULL) {
4522
76.8k
        *errorHandler = PyCodec_LookupError(errors);
4523
76.8k
        if (*errorHandler == NULL)
4524
0
            goto onError;
4525
76.8k
    }
4526
4527
268k
    make_decode_exception(exceptionObject,
4528
268k
        encoding,
4529
268k
        *input, *inend - *input,
4530
268k
        *startinpos, *endinpos,
4531
268k
        reason);
4532
268k
    if (*exceptionObject == NULL)
4533
0
        goto onError;
4534
4535
268k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4536
268k
    if (restuple == NULL)
4537
49.8k
        goto onError;
4538
218k
    if (!PyTuple_Check(restuple)) {
4539
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4540
0
        goto onError;
4541
0
    }
4542
218k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4543
0
        goto onError;
4544
4545
    /* Copy back the bytes variables, which might have been modified by the
4546
       callback */
4547
218k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4548
218k
    if (!inputobj)
4549
0
        goto onError;
4550
218k
    remain = *inend - *input - *endinpos;
4551
218k
    *input = PyBytes_AS_STRING(inputobj);
4552
218k
    insize = PyBytes_GET_SIZE(inputobj);
4553
218k
    *inend = *input + insize;
4554
    /* we can DECREF safely, as the exception has another reference,
4555
       so the object won't go away. */
4556
218k
    Py_DECREF(inputobj);
4557
4558
218k
    if (newpos<0)
4559
0
        newpos = insize+newpos;
4560
218k
    if (newpos<0 || newpos>insize) {
4561
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4562
0
        goto onError;
4563
0
    }
4564
4565
218k
    replen = PyUnicode_GET_LENGTH(repunicode);
4566
218k
    if (replen > 1) {
4567
29.4k
        writer->min_length += replen - 1;
4568
29.4k
        need_to_grow = 1;
4569
29.4k
    }
4570
218k
    new_inptr = *input + newpos;
4571
218k
    if (*inend - new_inptr > remain) {
4572
        /* We don't know the decoding algorithm here so we make the worst
4573
           assumption that one byte decodes to one unicode character.
4574
           If unfortunately one byte could decode to more unicode characters,
4575
           the decoder may write out-of-bound then.  Is it possible for the
4576
           algorithms using this function? */
4577
7.80k
        writer->min_length += *inend - new_inptr - remain;
4578
7.80k
        need_to_grow = 1;
4579
7.80k
    }
4580
218k
    if (need_to_grow) {
4581
29.6k
        writer->overallocate = 1;
4582
29.6k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4583
29.6k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4584
0
            goto onError;
4585
29.6k
    }
4586
218k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4587
0
        goto onError;
4588
4589
218k
    *endinpos = newpos;
4590
218k
    *inptr = new_inptr;
4591
4592
    /* we made it! */
4593
218k
    Py_DECREF(restuple);
4594
218k
    return 0;
4595
4596
49.8k
  onError:
4597
49.8k
    Py_XDECREF(restuple);
4598
49.8k
    return -1;
4599
218k
}
4600
4601
/* --- UTF-7 Codec -------------------------------------------------------- */
4602
4603
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4604
4605
/* Three simple macros defining base-64. */
4606
4607
/* True when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  Note: c is evaluated several times. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')
4614
4615
/* Map a base-64 character c to its 6-bit value (A-Z: 0-25, a-z: 26-51,
   0-9: 52-61, '+': 62, anything else: 63).  The caller must already know
   that c is a base-64 character.  Note: c is evaluated several times. */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)
4622
4623
/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])
4627
4628
/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which starts a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) < 128 && (c) != '+')
4635
4636
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code (0-127),
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence `const`.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4669
4670
/* ENCODE_DIRECT: true when character c should be encoded as itself.
 * Only non-nul ASCII characters can qualify.  Set D characters always
 * do; Set O characters only when directO is true; whitespace only when
 * directWS is true.  RFC2152 makes it clear that these choices vary
 * between applications, so both are left to the caller.
 * The range test comes first so utf7_category is never indexed out of
 * bounds.  Note: c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    (0 < (c) && (c) < 128 &&                            \
     (utf7_category[(c)] == 0 ||                        \
      ((directWS) && utf7_category[(c)] == 2) ||        \
      ((directO) && utf7_category[(c)] == 1)))
4681
4682
PyObject *
4683
PyUnicode_DecodeUTF7(const char *s,
4684
                     Py_ssize_t size,
4685
                     const char *errors)
4686
0
{
4687
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4688
0
}
4689
4690
/* The decoder.  The only state we preserve is our read position,
4691
 * i.e. how many characters we have consumed.  So if we end in the
4692
 * middle of a shift sequence we have to back off the read position
4693
 * and the output to the beginning of the sequence, otherwise we lose
4694
 * all the shift state (seen bits, number of bits seen, high
4695
 * surrogate). */
4696
4697
PyObject *
4698
PyUnicode_DecodeUTF7Stateful(const char *s,
4699
                             Py_ssize_t size,
4700
                             const char *errors,
4701
                             Py_ssize_t *consumed)
4702
28.2k
{
4703
28.2k
    const char *starts = s;
4704
28.2k
    Py_ssize_t startinpos;
4705
28.2k
    Py_ssize_t endinpos;
4706
28.2k
    const char *e;
4707
28.2k
    _PyUnicodeWriter writer;
4708
28.2k
    const char *errmsg = "";
4709
28.2k
    int inShift = 0;
4710
28.2k
    Py_ssize_t shiftOutStart;
4711
28.2k
    unsigned int base64bits = 0;
4712
28.2k
    unsigned long base64buffer = 0;
4713
28.2k
    Py_UCS4 surrogate = 0;
4714
28.2k
    PyObject *errorHandler = NULL;
4715
28.2k
    PyObject *exc = NULL;
4716
4717
28.2k
    if (size == 0) {
4718
0
        if (consumed)
4719
0
            *consumed = 0;
4720
0
        _Py_RETURN_UNICODE_EMPTY();
4721
0
    }
4722
4723
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4724
28.2k
    _PyUnicodeWriter_Init(&writer);
4725
28.2k
    writer.min_length = size;
4726
4727
28.2k
    shiftOutStart = 0;
4728
28.2k
    e = s + size;
4729
4730
7.83M
    while (s < e) {
4731
7.82M
        Py_UCS4 ch;
4732
7.82M
      restart:
4733
7.82M
        ch = (unsigned char) *s;
4734
4735
7.82M
        if (inShift) { /* in a base-64 section */
4736
241k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4737
224k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4738
224k
                base64bits += 6;
4739
224k
                s++;
4740
224k
                if (base64bits >= 16) {
4741
                    /* we have enough bits for a UTF-16 value */
4742
78.5k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4743
78.5k
                    base64bits -= 16;
4744
78.5k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4745
78.5k
                    assert(outCh <= 0xffff);
4746
78.5k
                    if (surrogate) {
4747
                        /* expecting a second surrogate */
4748
8.70k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4749
2.90k
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4750
2.90k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4751
0
                                goto onError;
4752
2.90k
                            surrogate = 0;
4753
2.90k
                            continue;
4754
2.90k
                        }
4755
5.79k
                        else {
4756
5.79k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4757
0
                                goto onError;
4758
5.79k
                            surrogate = 0;
4759
5.79k
                        }
4760
8.70k
                    }
4761
75.6k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4762
                        /* first surrogate */
4763
11.5k
                        surrogate = outCh;
4764
11.5k
                    }
4765
64.1k
                    else {
4766
64.1k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4767
0
                            goto onError;
4768
64.1k
                    }
4769
75.6k
                }
4770
224k
            }
4771
16.1k
            else { /* now leaving a base-64 section */
4772
16.1k
                inShift = 0;
4773
16.1k
                if (base64bits > 0) { /* left-over bits */
4774
12.7k
                    if (base64bits >= 6) {
4775
                        /* We've seen at least one base-64 character */
4776
6.46k
                        s++;
4777
6.46k
                        errmsg = "partial character in shift sequence";
4778
6.46k
                        goto utf7Error;
4779
6.46k
                    }
4780
6.24k
                    else {
4781
                        /* Some bits remain; they should be zero */
4782
6.24k
                        if (base64buffer != 0) {
4783
1.48k
                            s++;
4784
1.48k
                            errmsg = "non-zero padding bits in shift sequence";
4785
1.48k
                            goto utf7Error;
4786
1.48k
                        }
4787
6.24k
                    }
4788
12.7k
                }
4789
8.19k
                if (surrogate && DECODE_DIRECT(ch)) {
4790
2.19k
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4791
0
                        goto onError;
4792
2.19k
                }
4793
8.19k
                surrogate = 0;
4794
8.19k
                if (ch == '-') {
4795
                    /* '-' is absorbed; other terminating
4796
                       characters are preserved */
4797
2.23k
                    s++;
4798
2.23k
                }
4799
8.19k
            }
4800
241k
        }
4801
7.58M
        else if ( ch == '+' ) {
4802
24.6k
            startinpos = s-starts;
4803
24.6k
            s++; /* consume '+' */
4804
24.6k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4805
2.04k
                s++;
4806
2.04k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4807
0
                    goto onError;
4808
2.04k
            }
4809
22.6k
            else if (s < e && !IS_BASE64(*s)) {
4810
3.20k
                s++;
4811
3.20k
                errmsg = "ill-formed sequence";
4812
3.20k
                goto utf7Error;
4813
3.20k
            }
4814
19.4k
            else { /* begin base64-encoded section */
4815
19.4k
                inShift = 1;
4816
19.4k
                surrogate = 0;
4817
19.4k
                shiftOutStart = writer.pos;
4818
19.4k
                base64bits = 0;
4819
19.4k
                base64buffer = 0;
4820
19.4k
            }
4821
24.6k
        }
4822
7.55M
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4823
7.45M
            s++;
4824
7.45M
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4825
0
                goto onError;
4826
7.45M
        }
4827
100k
        else {
4828
100k
            startinpos = s-starts;
4829
100k
            s++;
4830
100k
            errmsg = "unexpected special character";
4831
100k
            goto utf7Error;
4832
100k
        }
4833
7.70M
        continue;
4834
7.70M
utf7Error:
4835
111k
        endinpos = s-starts;
4836
111k
        if (unicode_decode_call_errorhandler_writer(
4837
111k
                errors, &errorHandler,
4838
111k
                "utf7", errmsg,
4839
111k
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4840
111k
                &writer))
4841
12.0k
            goto onError;
4842
111k
    }
4843
4844
    /* end of string */
4845
4846
16.1k
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4847
        /* if we're in an inconsistent state, that's an error */
4848
3.26k
        inShift = 0;
4849
3.26k
        if (surrogate ||
4850
3.26k
                (base64bits >= 6) ||
4851
3.26k
                (base64bits > 0 && base64buffer != 0)) {
4852
2.02k
            endinpos = size;
4853
2.02k
            if (unicode_decode_call_errorhandler_writer(
4854
2.02k
                    errors, &errorHandler,
4855
2.02k
                    "utf7", "unterminated shift sequence",
4856
2.02k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4857
2.02k
                    &writer))
4858
1.70k
                goto onError;
4859
328
            if (s < e)
4860
0
                goto restart;
4861
328
        }
4862
3.26k
    }
4863
4864
    /* return state */
4865
14.4k
    if (consumed) {
4866
0
        if (inShift) {
4867
0
            *consumed = startinpos;
4868
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4869
0
                PyObject *result = PyUnicode_FromKindAndData(
4870
0
                        writer.kind, writer.data, shiftOutStart);
4871
0
                Py_XDECREF(errorHandler);
4872
0
                Py_XDECREF(exc);
4873
0
                _PyUnicodeWriter_Dealloc(&writer);
4874
0
                return result;
4875
0
            }
4876
0
            writer.pos = shiftOutStart; /* back off output */
4877
0
        }
4878
0
        else {
4879
0
            *consumed = s-starts;
4880
0
        }
4881
0
    }
4882
4883
14.4k
    Py_XDECREF(errorHandler);
4884
14.4k
    Py_XDECREF(exc);
4885
14.4k
    return _PyUnicodeWriter_Finish(&writer);
4886
4887
13.7k
  onError:
4888
13.7k
    Py_XDECREF(errorHandler);
4889
13.7k
    Py_XDECREF(exc);
4890
13.7k
    _PyUnicodeWriter_Dealloc(&writer);
4891
13.7k
    return NULL;
4892
14.4k
}
4893
4894
4895
/* Encode the str object `str` as UTF-7 and return a bytes object.

   base64SetO       : when true, Set O characters are base64-encoded
                      instead of being written directly
   base64WhiteSpace : when true, whitespace characters are base64-encoded
                      instead of being written directly
   errors           : not used by this implementation

   Returns NULL on failure (allocation or resize error). */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    PyObject *bytes;
    char *out;
    const char *out_begin;
    Py_ssize_t pos;
    int shifted = 0;              /* currently inside a base-64 section? */
    unsigned int nbits = 0;       /* number of pending bits in bitbuf */
    unsigned long bitbuf = 0;     /* bits not yet written out */

    if (len == 0) {
        return PyBytes_FromStringAndSize(NULL, 0);
    }

    /* Allocate 8 output bytes per input character.
       It might be possible to tighten this worst case. */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    bytes = PyBytes_FromStringAndSize(NULL, len * 8);
    if (bytes == NULL) {
        return NULL;
    }

    out = PyBytes_AS_STRING(bytes);
    out_begin = out;

    for (pos = 0; pos < len; ++pos) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (shifted) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* Shift out: flush the remaining bits first. */
                if (nbits) {
                    *out++ = TO_BASE64(bitbuf << (6-nbits));
                    bitbuf = 0;
                    nbits = 0;
                }
                shifted = 0;
                /* Characters not in the BASE64 set implicitly unshift the
                   sequence so no '-' is required, except if the character
                   is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
                continue;
            }
            /* Otherwise stay in the section and base64-encode ch below. */
        }
        else if (ch == '+') {
            /* A literal '+' is written as "+-". */
            *out++ = '+';
            *out++ = '-';
            continue;
        }
        else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
            *out++ = (char) ch;
            continue;
        }
        else {
            /* Open a base-64 section, then encode ch below. */
            *out++ = '+';
            shifted = 1;
        }

        /* Base64-encode ch as one or two 16-bit units. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* Emit the high surrogate first ... */
            nbits += 16;
            bitbuf = (bitbuf << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (nbits >= 6) {
                *out++ = TO_BASE64(bitbuf >> (nbits-6));
                nbits -= 6;
            }
            /* ... and let the low surrogate take the common path. */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        nbits += 16;
        bitbuf = (bitbuf << 16) | ch;
        while (nbits >= 6) {
            *out++ = TO_BASE64(bitbuf >> (nbits-6));
            nbits -= 6;
        }
    }

    /* Flush left-over bits and close an open base-64 section. */
    if (nbits) {
        *out++ = TO_BASE64(bitbuf << (6-nbits));
    }
    if (shifted) {
        *out++ = '-';
    }

    /* Trim the bytes object to the number of bytes actually written. */
    if (_PyBytes_Resize(&bytes, out - out_begin) < 0) {
        return NULL;
    }
    return bytes;
}
4994
4995
#undef IS_BASE64
4996
#undef FROM_BASE64
4997
#undef TO_BASE64
4998
#undef DECODE_DIRECT
4999
#undef ENCODE_DIRECT
5000
5001
/* --- UTF-8 Codec -------------------------------------------------------- */
5002
5003
PyObject *
5004
PyUnicode_DecodeUTF8(const char *s,
5005
                     Py_ssize_t size,
5006
                     const char *errors)
5007
2.27M
{
5008
2.27M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5009
2.27M
}
5010
5011
#include "stringlib/asciilib.h"
5012
#include "stringlib/codecs.h"
5013
#include "stringlib/undef.h"
5014
5015
#include "stringlib/ucs1lib.h"
5016
#include "stringlib/codecs.h"
5017
#include "stringlib/undef.h"
5018
5019
#include "stringlib/ucs2lib.h"
5020
#include "stringlib/codecs.h"
5021
#include "stringlib/undef.h"
5022
5023
#include "stringlib/ucs4lib.h"
5024
#include "stringlib/codecs.h"
5025
#include "stringlib/undef.h"
5026
5027
/* Word-sized bit patterns, one variant per supported size_t width:
     ASCII_CHAR_MASK - bit 7 of every byte; a non-zero AND with a word
                       means the word holds a non-ASCII (UTF8-encoded)
                       byte.
     VECTOR_0101     - bit 0 of every byte; used to count code points in
                       a UTF-8 string.
     VECTOR_00FF     - every other byte fully set; used to count code
                       points in a UTF-8 string. */
#if (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#elif (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#else
# error C 'size_t' size should be either 4 or 8!
#endif
5041
5042
/* ctz(v): index of the lowest set bit of v.
   HAVE_CTZ is 1 when an implementation is available (GCC/Clang builtin or
   MSVC intrinsic) and 0 otherwise, in which case ctz() is not defined. */
#if (defined(__clang__) || defined(__GNUC__))
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    const unsigned long long wide = (unsigned long long)v;
    return __builtin_ctzll(wide);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long bit_index;
    /* Pick the intrinsic matching the width of size_t. */
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&bit_index, v);
#else
    _BitScanForward64(&bit_index, v);
#endif /* SIZEOF_SIZE_T */
    return bit_index;
}
#else
#define HAVE_CTZ 0
#endif
5065
5066
#if HAVE_CTZ && PY_LITTLE_ENDIAN
5067
// load p[0]..p[size-1] as a size_t without unaligned access nor read ahead.
5068
static size_t
5069
load_unaligned(const unsigned char *p, size_t size)
5070
14.5M
{
5071
14.5M
    union {
5072
14.5M
        size_t s;
5073
14.5M
        unsigned char b[SIZEOF_SIZE_T];
5074
14.5M
    } u;
5075
14.5M
    u.s = 0;
5076
    // This switch statement assumes little endian because:
5077
    // * union is faster than bitwise or and shift.
5078
    // * big endian machine is rare and hard to maintain.
5079
14.5M
    switch (size) {
5080
0
    default:
5081
0
#if SIZEOF_SIZE_T == 8
5082
0
    case 8:
5083
0
        u.b[7] = p[7];
5084
0
        _Py_FALLTHROUGH;
5085
787k
    case 7:
5086
787k
        u.b[6] = p[6];
5087
787k
        _Py_FALLTHROUGH;
5088
3.25M
    case 6:
5089
3.25M
        u.b[5] = p[5];
5090
3.25M
        _Py_FALLTHROUGH;
5091
3.82M
    case 5:
5092
3.82M
        u.b[4] = p[4];
5093
3.82M
        _Py_FALLTHROUGH;
5094
3.82M
#endif
5095
4.30M
    case 4:
5096
4.30M
        u.b[3] = p[3];
5097
4.30M
        _Py_FALLTHROUGH;
5098
10.4M
    case 3:
5099
10.4M
        u.b[2] = p[2];
5100
10.4M
        _Py_FALLTHROUGH;
5101
14.1M
    case 2:
5102
14.1M
        u.b[1] = p[1];
5103
14.1M
        _Py_FALLTHROUGH;
5104
14.3M
    case 1:
5105
14.3M
        u.b[0] = p[0];
5106
14.3M
        break;
5107
191k
    case 0:
5108
191k
        break;
5109
14.5M
    }
5110
14.5M
    return u.s;
5111
14.5M
}
5112
#endif
5113
5114
/*
5115
 * Find the first non-ASCII character in a byte sequence.
5116
 *
5117
 * This function scans a range of bytes from `start` to `end` and returns the
5118
 * index of the first byte that is not an ASCII character (i.e., has the most
5119
 * significant bit set). If all characters in the range are ASCII, it returns
5120
 * `end - start`.
5121
 */
5122
static Py_ssize_t
5123
find_first_nonascii(const unsigned char *start, const unsigned char *end)
5124
14.8M
{
5125
    // The search is done in `size_t` chunks.
5126
    // The start and end might not be aligned at `size_t` boundaries,
5127
    // so they're handled specially.
5128
5129
14.8M
    const unsigned char *p = start;
5130
5131
14.8M
    if (end - start >= SIZEOF_SIZE_T) {
5132
        // Avoid unaligned read.
5133
3.54M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5134
3.54M
        size_t u;
5135
3.54M
        memcpy(&u, p, sizeof(size_t));
5136
3.54M
        u &= ASCII_CHAR_MASK;
5137
3.54M
        if (u) {
5138
141k
            return (ctz(u) - 7) / 8;
5139
141k
        }
5140
3.40M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
5141
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
5142
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
5143
        while (p < p2) {
5144
            if (*p & 0x80) {
5145
                return p - start;
5146
            }
5147
            p++;
5148
        }
5149
#endif
5150
5151
3.40M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5152
99.0M
        while (p <= e) {
5153
95.7M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5154
95.7M
            if (u) {
5155
98.1k
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5156
98.1k
                return p - start + (ctz(u) - 7) / 8;
5157
#else
5158
                // big endian and minor compilers are difficult to test.
5159
                // fallback to per byte check.
5160
                break;
5161
#endif
5162
98.1k
            }
5163
95.6M
            p += SIZEOF_SIZE_T;
5164
95.6M
        }
5165
3.40M
    }
5166
14.5M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5167
14.5M
    assert((end - p) < SIZEOF_SIZE_T);
5168
    // we can not use *(const size_t*)p to avoid buffer overrun.
5169
14.5M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5170
14.5M
    if (u) {
5171
177k
        return p - start + (ctz(u) - 7) / 8;
5172
177k
    }
5173
14.4M
    return end - start;
5174
#else
5175
    while (p < end) {
5176
        if (*p & 0x80) {
5177
            break;
5178
        }
5179
        p++;
5180
    }
5181
    return p - start;
5182
#endif
5183
14.5M
}
5184
5185
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    // A byte starts a character when it looks like 0xxxxxxx (bit 7 clear)
    // or 11xxxxxx (bit 6 set); only 10xxxxxx continuation bytes yield 0.
    int bit7_clear = (ch & 0x80u) == 0;
    int bit6_set = (ch & 0x40u) != 0;
    return bit7_clear || bit6_set;
}
5191
5192
static inline size_t
5193
vector_utf8_start_chars(size_t v)
5194
259M
{
5195
259M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5196
259M
}
5197
5198
5199
// Count the number of UTF-8 code points in a given byte sequence.
5200
static Py_ssize_t
5201
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5202
97.8k
{
5203
97.8k
    Py_ssize_t len = 0;
5204
5205
97.8k
    if (end - s >= SIZEOF_SIZE_T) {
5206
49.5k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5207
19.2k
            len += scalar_utf8_start_char(*s++);
5208
19.2k
        }
5209
5210
1.07M
        while (s + SIZEOF_SIZE_T <= end) {
5211
1.04M
            const unsigned char *e = end;
5212
1.04M
            if (e - s > SIZEOF_SIZE_T * 255) {
5213
1.01M
                e = s + SIZEOF_SIZE_T * 255;
5214
1.01M
            }
5215
1.04M
            Py_ssize_t vstart = 0;
5216
260M
            while (s + SIZEOF_SIZE_T <= e) {
5217
259M
                size_t v = *(size_t*)s;
5218
259M
                size_t vs = vector_utf8_start_chars(v);
5219
259M
                vstart += vs;
5220
259M
                s += SIZEOF_SIZE_T;
5221
259M
            }
5222
1.04M
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5223
1.04M
            vstart += vstart >> 16;
5224
1.04M
#if SIZEOF_SIZE_T == 8
5225
1.04M
            vstart += vstart >> 32;
5226
1.04M
#endif
5227
1.04M
            len += vstart & 0x7ff;
5228
1.04M
        }
5229
30.2k
    }
5230
435k
    while (s < end) {
5231
337k
        len += scalar_utf8_start_char(*s++);
5232
337k
    }
5233
97.8k
    return len;
5234
97.8k
}
5235
5236
static Py_ssize_t
5237
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5238
6.23M
{
5239
6.23M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5240
6.23M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5241
6.23M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5242
843k
    {
5243
        /* Fast path, see in STRINGLIB(utf8_decode) for
5244
           an explanation. */
5245
843k
        const char *p = start;
5246
843k
        Py_UCS1 *q = dest;
5247
1.81M
        while (p + SIZEOF_SIZE_T <= end) {
5248
1.09M
            size_t value = *(const size_t *) p;
5249
1.09M
            if (value & ASCII_CHAR_MASK)
5250
123k
                break;
5251
975k
            *((size_t *)q) = value;
5252
975k
            p += SIZEOF_SIZE_T;
5253
975k
            q += SIZEOF_SIZE_T;
5254
975k
        }
5255
3.75M
        while (p < end) {
5256
3.05M
            if ((unsigned char)*p & 0x80)
5257
145k
                break;
5258
2.91M
            *q++ = *p++;
5259
2.91M
        }
5260
843k
        return p - start;
5261
843k
    }
5262
5.38M
#endif
5263
5.38M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5264
5.38M
                                         (const unsigned char*)end);
5265
5.38M
    memcpy(dest, start, pos);
5266
5.38M
    return pos;
5267
6.23M
}
5268
5269
static int
5270
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5271
                         const char *starts, const char *s, const char *end,
5272
                         _Py_error_handler error_handler,
5273
                         const char *errors,
5274
                         Py_ssize_t *consumed)
5275
418k
{
5276
418k
    Py_ssize_t startinpos, endinpos;
5277
418k
    const char *errmsg = "";
5278
418k
    PyObject *error_handler_obj = NULL;
5279
418k
    PyObject *exc = NULL;
5280
5281
167M
    while (s < end) {
5282
167M
        Py_UCS4 ch;
5283
167M
        int kind = writer->kind;
5284
5285
167M
        if (kind == PyUnicode_1BYTE_KIND) {
5286
414k
            if (PyUnicode_IS_ASCII(writer->buffer))
5287
318k
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5288
95.6k
            else
5289
95.6k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5290
166M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5291
84.1M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5292
84.1M
        } else {
5293
82.5M
            assert(kind == PyUnicode_4BYTE_KIND);
5294
82.5M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5295
82.5M
        }
5296
5297
167M
        switch (ch) {
5298
340k
        case 0:
5299
340k
            if (s == end || consumed)
5300
314k
                goto End;
5301
25.2k
            errmsg = "unexpected end of data";
5302
25.2k
            startinpos = s - starts;
5303
25.2k
            endinpos = end - starts;
5304
25.2k
            break;
5305
132M
        case 1:
5306
132M
            errmsg = "invalid start byte";
5307
132M
            startinpos = s - starts;
5308
132M
            endinpos = startinpos + 1;
5309
132M
            break;
5310
33.0M
        case 2:
5311
33.0M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5312
33.0M
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5313
0
            {
5314
                /* Truncated surrogate code in range D800-DFFF */
5315
0
                goto End;
5316
0
            }
5317
33.0M
            _Py_FALLTHROUGH;
5318
34.1M
        case 3:
5319
34.2M
        case 4:
5320
34.2M
            errmsg = "invalid continuation byte";
5321
34.2M
            startinpos = s - starts;
5322
34.2M
            endinpos = startinpos + ch - 1;
5323
34.2M
            break;
5324
306k
        default:
5325
            // ch doesn't fit into kind, so change the buffer kind to write
5326
            // the character
5327
306k
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5328
0
                goto onError;
5329
306k
            continue;
5330
167M
        }
5331
5332
166M
        if (error_handler == _Py_ERROR_UNKNOWN)
5333
112k
            error_handler = _Py_GetErrorHandler(errors);
5334
5335
166M
        switch (error_handler) {
5336
0
        case _Py_ERROR_IGNORE:
5337
0
            s += (endinpos - startinpos);
5338
0
            break;
5339
5340
166M
        case _Py_ERROR_REPLACE:
5341
166M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5342
0
                goto onError;
5343
166M
            s += (endinpos - startinpos);
5344
166M
            break;
5345
5346
3.00k
        case _Py_ERROR_SURROGATEESCAPE:
5347
3.00k
        {
5348
3.00k
            Py_ssize_t i;
5349
5350
3.00k
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5351
0
                goto onError;
5352
6.39k
            for (i=startinpos; i<endinpos; i++) {
5353
3.38k
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5354
3.38k
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5355
3.38k
                                ch + 0xdc00);
5356
3.38k
                writer->pos++;
5357
3.38k
            }
5358
3.00k
            s += (endinpos - startinpos);
5359
3.00k
            break;
5360
3.00k
        }
5361
5362
3.76k
        default:
5363
3.76k
            if (unicode_decode_call_errorhandler_writer(
5364
3.76k
                    errors, &error_handler_obj,
5365
3.76k
                    "utf-8", errmsg,
5366
3.76k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5367
3.76k
                    writer)) {
5368
3.76k
                goto onError;
5369
3.76k
            }
5370
5371
0
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5372
0
                return -1;
5373
0
            }
5374
166M
        }
5375
166M
    }
5376
5377
415k
End:
5378
415k
    if (consumed)
5379
1.79k
        *consumed = s - starts;
5380
5381
415k
    Py_XDECREF(error_handler_obj);
5382
415k
    Py_XDECREF(exc);
5383
415k
    return 0;
5384
5385
3.76k
onError:
5386
3.76k
    Py_XDECREF(error_handler_obj);
5387
3.76k
    Py_XDECREF(exc);
5388
3.76k
    return -1;
5389
418k
}
5390
5391
5392
static PyObject *
5393
unicode_decode_utf8(const char *s, Py_ssize_t size,
5394
                    _Py_error_handler error_handler, const char *errors,
5395
                    Py_ssize_t *consumed)
5396
11.2M
{
5397
11.2M
    if (size == 0) {
5398
79.2k
        if (consumed) {
5399
0
            *consumed = 0;
5400
0
        }
5401
79.2k
        _Py_RETURN_UNICODE_EMPTY();
5402
79.2k
    }
5403
5404
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5405
11.1M
    if (size == 1 && (unsigned char)s[0] < 128) {
5406
1.69M
        if (consumed) {
5407
0
            *consumed = 1;
5408
0
        }
5409
1.69M
        return get_latin1_char((unsigned char)s[0]);
5410
1.69M
    }
5411
5412
    // I don't know this check is necessary or not. But there is a test
5413
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5414
9.43M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5415
0
        PyErr_NoMemory();
5416
0
        return NULL;
5417
0
    }
5418
5419
9.43M
    const char *starts = s;
5420
9.43M
    const char *end = s + size;
5421
5422
9.43M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5423
9.43M
    if (pos == size) {  // fast path: ASCII string.
5424
9.07M
        PyObject *u = PyUnicode_New(size, 127);
5425
9.07M
        if (u == NULL) {
5426
0
            return NULL;
5427
0
        }
5428
9.07M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5429
9.07M
        if (consumed) {
5430
0
            *consumed = size;
5431
0
        }
5432
9.07M
        return u;
5433
9.07M
    }
5434
5435
359k
    int maxchr = 127;
5436
359k
    Py_ssize_t maxsize = size;
5437
5438
359k
    unsigned char ch = (unsigned char)(s[pos]);
5439
    // error handler other than strict may remove/replace the invalid byte.
5440
    // consumed != NULL allows 1~3 bytes remainings.
5441
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5442
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5443
    // reallocation and copy.
5444
359k
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5445
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5446
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5447
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5448
        // means that it is no longer necessary to allocate several times the required amount
5449
        // of memory.
5450
97.8k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5451
97.8k
        if (ch < 0xc4) { // latin1
5452
15.4k
            maxchr = 0xff;
5453
15.4k
        }
5454
82.3k
        else if (ch < 0xf0) { // ucs2
5455
73.1k
            maxchr = 0xffff;
5456
73.1k
        }
5457
9.19k
        else { // ucs4
5458
9.19k
            maxchr = 0x10ffff;
5459
9.19k
        }
5460
97.8k
    }
5461
359k
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5462
359k
    if (!u) {
5463
0
        return NULL;
5464
0
    }
5465
5466
    // Use _PyUnicodeWriter after fast path is failed.
5467
359k
    _PyUnicodeWriter writer;
5468
359k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5469
359k
    if (maxchr <= 255) {
5470
276k
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5471
276k
        s += pos;
5472
276k
        size -= pos;
5473
276k
        writer.pos = pos;
5474
276k
    }
5475
5476
359k
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5477
359k
                                 error_handler, errors,
5478
359k
                                 consumed) < 0) {
5479
3.76k
        _PyUnicodeWriter_Dealloc(&writer);
5480
3.76k
        return NULL;
5481
3.76k
    }
5482
355k
    return _PyUnicodeWriter_Finish(&writer);
5483
359k
}
5484
5485
5486
// Used by PyUnicodeWriter_WriteUTF8() implementation
5487
static int
5488
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
5489
                           const char *s, Py_ssize_t size,
5490
                           _Py_error_handler error_handler, const char *errors,
5491
                           Py_ssize_t *consumed)
5492
5.39M
{
5493
5.39M
    if (size == 0) {
5494
7.30k
        if (consumed) {
5495
0
            *consumed = 0;
5496
0
        }
5497
7.30k
        return 0;
5498
7.30k
    }
5499
5500
    // fast path: try ASCII string.
5501
5.39M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5502
0
        return -1;
5503
0
    }
5504
5505
5.39M
    const char *starts = s;
5506
5.39M
    const char *end = s + size;
5507
5.39M
    Py_ssize_t decoded = 0;
5508
5.39M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5509
5.39M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5510
5.38M
        decoded = ascii_decode(s, end, dest);
5511
5.38M
        writer->pos += decoded;
5512
5513
5.38M
        if (decoded == size) {
5514
5.33M
            if (consumed) {
5515
1.53k
                *consumed = size;
5516
1.53k
            }
5517
5.33M
            return 0;
5518
5.33M
        }
5519
57.3k
        s += decoded;
5520
57.3k
        size -= decoded;
5521
57.3k
    }
5522
5523
59.5k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5524
59.5k
                                    error_handler, errors, consumed);
5525
5.39M
}
5526
5527
5528
PyObject *
5529
PyUnicode_DecodeUTF8Stateful(const char *s,
5530
                             Py_ssize_t size,
5531
                             const char *errors,
5532
                             Py_ssize_t *consumed)
5533
11.2M
{
5534
11.2M
    return unicode_decode_utf8(s, size,
5535
11.2M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5536
11.2M
                               errors, consumed);
5537
11.2M
}
5538
5539
5540
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5541
   non-zero, use strict error handler otherwise.
5542
5543
   On success, write a pointer to a newly allocated wide character string into
5544
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5545
   (in number of wchar_t units) into *wlen (if wlen is set).
5546
5547
   On memory allocation failure, return -1.
5548
5549
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5550
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5551
   is not NULL, write the decoding error message into *reason. */
5552
int
5553
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5554
                 const char **reason, _Py_error_handler errors)
5555
5.23k
{
5556
5.23k
    const char *orig_s = s;
5557
5.23k
    const char *e;
5558
5.23k
    wchar_t *unicode;
5559
5.23k
    Py_ssize_t outpos;
5560
5561
5.23k
    int surrogateescape = 0;
5562
5.23k
    int surrogatepass = 0;
5563
5.23k
    switch (errors)
5564
5.23k
    {
5565
0
    case _Py_ERROR_STRICT:
5566
0
        break;
5567
5.23k
    case _Py_ERROR_SURROGATEESCAPE:
5568
5.23k
        surrogateescape = 1;
5569
5.23k
        break;
5570
0
    case _Py_ERROR_SURROGATEPASS:
5571
0
        surrogatepass = 1;
5572
0
        break;
5573
0
    default:
5574
0
        return -3;
5575
5.23k
    }
5576
5577
    /* Note: size will always be longer than the resulting Unicode
5578
       character count */
5579
5.23k
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5580
0
        return -1;
5581
0
    }
5582
5583
5.23k
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5584
5.23k
    if (!unicode) {
5585
0
        return -1;
5586
0
    }
5587
5588
    /* Unpack UTF-8 encoded data */
5589
5.23k
    e = s + size;
5590
5.23k
    outpos = 0;
5591
5.23k
    while (s < e) {
5592
5.23k
        Py_UCS4 ch;
5593
5.23k
#if SIZEOF_WCHAR_T == 4
5594
5.23k
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5595
#else
5596
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5597
#endif
5598
5.23k
        if (ch > 0xFF) {
5599
0
#if SIZEOF_WCHAR_T == 4
5600
0
            Py_UNREACHABLE();
5601
#else
5602
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5603
            /* write a surrogate pair */
5604
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5605
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5606
#endif
5607
0
        }
5608
5.23k
        else {
5609
5.23k
            if (!ch && s == e) {
5610
5.23k
                break;
5611
5.23k
            }
5612
5613
0
            if (surrogateescape) {
5614
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5615
0
            }
5616
0
            else {
5617
                /* Is it a valid three-byte code? */
5618
0
                if (surrogatepass
5619
0
                    && (e - s) >= 3
5620
0
                    && (s[0] & 0xf0) == 0xe0
5621
0
                    && (s[1] & 0xc0) == 0x80
5622
0
                    && (s[2] & 0xc0) == 0x80)
5623
0
                {
5624
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5625
0
                    s += 3;
5626
0
                    unicode[outpos++] = ch;
5627
0
                }
5628
0
                else {
5629
0
                    PyMem_RawFree(unicode );
5630
0
                    if (reason != NULL) {
5631
0
                        switch (ch) {
5632
0
                        case 0:
5633
0
                            *reason = "unexpected end of data";
5634
0
                            break;
5635
0
                        case 1:
5636
0
                            *reason = "invalid start byte";
5637
0
                            break;
5638
                        /* 2, 3, 4 */
5639
0
                        default:
5640
0
                            *reason = "invalid continuation byte";
5641
0
                            break;
5642
0
                        }
5643
0
                    }
5644
0
                    if (wlen != NULL) {
5645
0
                        *wlen = s - orig_s;
5646
0
                    }
5647
0
                    return -2;
5648
0
                }
5649
0
            }
5650
0
        }
5651
5.23k
    }
5652
5.23k
    unicode[outpos] = L'\0';
5653
5.23k
    if (wlen) {
5654
5.23k
        *wlen = outpos;
5655
5.23k
    }
5656
5.23k
    *wstr = unicode;
5657
5.23k
    return 0;
5658
5.23k
}
5659
5660
5661
wchar_t*
5662
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5663
                               size_t *wlen)
5664
0
{
5665
0
    wchar_t *wstr;
5666
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5667
0
                               &wstr, wlen,
5668
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5669
0
    if (res != 0) {
5670
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5671
0
        assert(res != -3);
5672
0
        if (wlen) {
5673
0
            *wlen = (size_t)res;
5674
0
        }
5675
0
        return NULL;
5676
0
    }
5677
0
    return wstr;
5678
0
}
5679
5680
5681
/* UTF-8 encoder.
5682
5683
   On success, return 0 and write the newly allocated character string (use
5684
   PyMem_Free() to free the memory) into *str.
5685
5686
   On encoding failure, return -2 and write the position of the invalid
5687
   surrogate character into *error_pos (if error_pos is set) and the decoding
5688
   error message into *reason (if reason is set).
5689
5690
   On memory allocation failure, return -1. */
5691
int
5692
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5693
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5694
638
{
5695
638
    const Py_ssize_t max_char_size = 4;
5696
638
    Py_ssize_t len = wcslen(text);
5697
5698
638
    assert(len >= 0);
5699
5700
638
    int surrogateescape = 0;
5701
638
    int surrogatepass = 0;
5702
638
    switch (errors)
5703
638
    {
5704
64
    case _Py_ERROR_STRICT:
5705
64
        break;
5706
574
    case _Py_ERROR_SURROGATEESCAPE:
5707
574
        surrogateescape = 1;
5708
574
        break;
5709
0
    case _Py_ERROR_SURROGATEPASS:
5710
0
        surrogatepass = 1;
5711
0
        break;
5712
0
    default:
5713
0
        return -3;
5714
638
    }
5715
5716
638
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5717
0
        return -1;
5718
0
    }
5719
638
    char *bytes;
5720
638
    if (raw_malloc) {
5721
638
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5722
638
    }
5723
0
    else {
5724
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5725
0
    }
5726
638
    if (bytes == NULL) {
5727
0
        return -1;
5728
0
    }
5729
5730
638
    char *p = bytes;
5731
638
    Py_ssize_t i;
5732
42.2k
    for (i = 0; i < len; ) {
5733
41.6k
        Py_ssize_t ch_pos = i;
5734
41.6k
        Py_UCS4 ch = text[i];
5735
41.6k
        i++;
5736
#if Py_UNICODE_SIZE == 2
5737
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5738
            && i < len
5739
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5740
        {
5741
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5742
            i++;
5743
        }
5744
#endif
5745
5746
41.6k
        if (ch < 0x80) {
5747
            /* Encode ASCII */
5748
41.6k
            *p++ = (char) ch;
5749
5750
41.6k
        }
5751
0
        else if (ch < 0x0800) {
5752
            /* Encode Latin-1 */
5753
0
            *p++ = (char)(0xc0 | (ch >> 6));
5754
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5755
0
        }
5756
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5757
            /* surrogateescape error handler */
5758
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5759
0
                if (error_pos != NULL) {
5760
0
                    *error_pos = (size_t)ch_pos;
5761
0
                }
5762
0
                if (reason != NULL) {
5763
0
                    *reason = "encoding error";
5764
0
                }
5765
0
                if (raw_malloc) {
5766
0
                    PyMem_RawFree(bytes);
5767
0
                }
5768
0
                else {
5769
0
                    PyMem_Free(bytes);
5770
0
                }
5771
0
                return -2;
5772
0
            }
5773
0
            *p++ = (char)(ch & 0xff);
5774
0
        }
5775
0
        else if (ch < 0x10000) {
5776
0
            *p++ = (char)(0xe0 | (ch >> 12));
5777
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5778
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5779
0
        }
5780
0
        else {  /* ch >= 0x10000 */
5781
0
            assert(ch <= MAX_UNICODE);
5782
            /* Encode UCS4 Unicode ordinals */
5783
0
            *p++ = (char)(0xf0 | (ch >> 18));
5784
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5785
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5786
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5787
0
        }
5788
41.6k
    }
5789
638
    *p++ = '\0';
5790
5791
638
    size_t final_size = (p - bytes);
5792
638
    char *bytes2;
5793
638
    if (raw_malloc) {
5794
638
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5795
638
    }
5796
0
    else {
5797
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5798
0
    }
5799
638
    if (bytes2 == NULL) {
5800
0
        if (error_pos != NULL) {
5801
0
            *error_pos = (size_t)-1;
5802
0
        }
5803
0
        if (raw_malloc) {
5804
0
            PyMem_RawFree(bytes);
5805
0
        }
5806
0
        else {
5807
0
            PyMem_Free(bytes);
5808
0
        }
5809
0
        return -1;
5810
0
    }
5811
638
    *str = bytes2;
5812
638
    return 0;
5813
638
}
5814
5815
5816
/* Primary internal function which creates utf8 encoded bytes objects.
5817
5818
   Allocation strategy:  if the string is short, convert into a stack buffer
5819
   and allocate exactly as much space needed at the end.  Else allocate the
5820
   maximum possible needed (4 result bytes per Unicode character), and return
5821
   the excess memory at the end.
5822
*/
5823
static PyObject *
5824
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5825
                    const char *errors)
5826
15.1M
{
5827
15.1M
    if (!PyUnicode_Check(unicode)) {
5828
0
        PyErr_BadArgument();
5829
0
        return NULL;
5830
0
    }
5831
5832
15.1M
    if (PyUnicode_UTF8(unicode))
5833
8.75M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5834
8.75M
                                         PyUnicode_UTF8_LENGTH(unicode));
5835
5836
6.39M
    int kind = PyUnicode_KIND(unicode);
5837
6.39M
    const void *data = PyUnicode_DATA(unicode);
5838
6.39M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5839
5840
6.39M
    _PyBytesWriter writer;
5841
6.39M
    char *end;
5842
5843
6.39M
    switch (kind) {
5844
0
    default:
5845
0
        Py_UNREACHABLE();
5846
4.85M
    case PyUnicode_1BYTE_KIND:
5847
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5848
4.85M
        assert(!PyUnicode_IS_ASCII(unicode));
5849
4.85M
        end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5850
4.85M
        break;
5851
1.48M
    case PyUnicode_2BYTE_KIND:
5852
1.48M
        end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5853
1.48M
        break;
5854
61.8k
    case PyUnicode_4BYTE_KIND:
5855
61.8k
        end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5856
61.8k
        break;
5857
6.39M
    }
5858
5859
6.39M
    if (end == NULL) {
5860
148k
        _PyBytesWriter_Dealloc(&writer);
5861
148k
        return NULL;
5862
148k
    }
5863
6.25M
    return _PyBytesWriter_Finish(&writer, end);
5864
6.39M
}
5865
5866
static int
5867
unicode_fill_utf8(PyObject *unicode)
5868
162k
{
5869
162k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5870
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5871
162k
    assert(!PyUnicode_IS_ASCII(unicode));
5872
5873
162k
    int kind = PyUnicode_KIND(unicode);
5874
162k
    const void *data = PyUnicode_DATA(unicode);
5875
162k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5876
5877
162k
    _PyBytesWriter writer;
5878
162k
    char *end;
5879
5880
162k
    switch (kind) {
5881
0
    default:
5882
0
        Py_UNREACHABLE();
5883
134k
    case PyUnicode_1BYTE_KIND:
5884
134k
        end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5885
134k
                                   _Py_ERROR_STRICT, NULL);
5886
134k
        break;
5887
23.9k
    case PyUnicode_2BYTE_KIND:
5888
23.9k
        end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5889
23.9k
                                   _Py_ERROR_STRICT, NULL);
5890
23.9k
        break;
5891
4.69k
    case PyUnicode_4BYTE_KIND:
5892
4.69k
        end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5893
4.69k
                                   _Py_ERROR_STRICT, NULL);
5894
4.69k
        break;
5895
162k
    }
5896
162k
    if (end == NULL) {
5897
274
        _PyBytesWriter_Dealloc(&writer);
5898
274
        return -1;
5899
274
    }
5900
5901
162k
    const char *start = writer.use_small_buffer ? writer.small_buffer :
5902
162k
                    PyBytes_AS_STRING(writer.buffer);
5903
162k
    Py_ssize_t len = end - start;
5904
5905
162k
    char *cache = PyMem_Malloc(len + 1);
5906
162k
    if (cache == NULL) {
5907
0
        _PyBytesWriter_Dealloc(&writer);
5908
0
        PyErr_NoMemory();
5909
0
        return -1;
5910
0
    }
5911
162k
    memcpy(cache, start, len);
5912
162k
    cache[len] = '\0';
5913
162k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5914
162k
    PyUnicode_SET_UTF8(unicode, cache);
5915
162k
    _PyBytesWriter_Dealloc(&writer);
5916
162k
    return 0;
5917
162k
}
5918
5919
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    /* The handler is passed as "unknown"; `errors` is forwarded so the
       encoder can work with the handler name. */
    PyObject *encoded = unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN,
                                            errors);
    return encoded;
}
5924
5925
5926
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    /* Same as _PyUnicode_AsUTF8String() with no error handler name. */
    PyObject *encoded = _PyUnicode_AsUTF8String(unicode, NULL);
    return encoded;
}
5931
5932
/* --- UTF-32 Codec ------------------------------------------------------- */
5933
5934
PyObject *
5935
PyUnicode_DecodeUTF32(const char *s,
5936
                      Py_ssize_t size,
5937
                      const char *errors,
5938
                      int *byteorder)
5939
63
{
5940
63
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5941
63
}
5942
5943
PyObject *
5944
PyUnicode_DecodeUTF32Stateful(const char *s,
5945
                              Py_ssize_t size,
5946
                              const char *errors,
5947
                              int *byteorder,
5948
                              Py_ssize_t *consumed)
5949
21.0k
{
5950
21.0k
    const char *starts = s;
5951
21.0k
    Py_ssize_t startinpos;
5952
21.0k
    Py_ssize_t endinpos;
5953
21.0k
    _PyUnicodeWriter writer;
5954
21.0k
    const unsigned char *q, *e;
5955
21.0k
    int le, bo = 0;       /* assume native ordering by default */
5956
21.0k
    const char *encoding;
5957
21.0k
    const char *errmsg = "";
5958
21.0k
    PyObject *errorHandler = NULL;
5959
21.0k
    PyObject *exc = NULL;
5960
5961
21.0k
    q = (const unsigned char *)s;
5962
21.0k
    e = q + size;
5963
5964
21.0k
    if (byteorder)
5965
20.9k
        bo = *byteorder;
5966
5967
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5968
       byte order setting accordingly. In native mode, the leading BOM
5969
       mark is skipped, in all other modes, it is copied to the output
5970
       stream as-is (giving a ZWNBSP character). */
5971
21.0k
    if (bo == 0 && size >= 4) {
5972
19.1k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5973
19.1k
        if (bom == 0x0000FEFF) {
5974
202
            bo = -1;
5975
202
            q += 4;
5976
202
        }
5977
18.9k
        else if (bom == 0xFFFE0000) {
5978
207
            bo = 1;
5979
207
            q += 4;
5980
207
        }
5981
19.1k
        if (byteorder)
5982
19.1k
            *byteorder = bo;
5983
19.1k
    }
5984
5985
21.0k
    if (q == e) {
5986
75
        if (consumed)
5987
0
            *consumed = size;
5988
75
        _Py_RETURN_UNICODE_EMPTY();
5989
75
    }
5990
5991
#ifdef WORDS_BIGENDIAN
5992
    le = bo < 0;
5993
#else
5994
20.9k
    le = bo <= 0;
5995
20.9k
#endif
5996
20.9k
    encoding = le ? "utf-32-le" : "utf-32-be";
5997
5998
20.9k
    _PyUnicodeWriter_Init(&writer);
5999
20.9k
    writer.min_length = (e - q + 3) / 4;
6000
20.9k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6001
0
        goto onError;
6002
6003
79.3k
    while (1) {
6004
79.3k
        Py_UCS4 ch = 0;
6005
79.3k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
6006
6007
79.3k
        if (e - q >= 4) {
6008
65.9k
            int kind = writer.kind;
6009
65.9k
            void *data = writer.data;
6010
65.9k
            const unsigned char *last = e - 4;
6011
65.9k
            Py_ssize_t pos = writer.pos;
6012
65.9k
            if (le) {
6013
82.5k
                do {
6014
82.5k
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
6015
82.5k
                    if (ch > maxch)
6016
61.2k
                        break;
6017
21.3k
                    if (kind != PyUnicode_1BYTE_KIND &&
6018
21.3k
                        Py_UNICODE_IS_SURROGATE(ch))
6019
140
                        break;
6020
21.1k
                    PyUnicode_WRITE(kind, data, pos++, ch);
6021
21.1k
                    q += 4;
6022
21.1k
                } while (q <= last);
6023
62.3k
            }
6024
3.64k
            else {
6025
6.16k
                do {
6026
6.16k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
6027
6.16k
                    if (ch > maxch)
6028
3.41k
                        break;
6029
2.74k
                    if (kind != PyUnicode_1BYTE_KIND &&
6030
2.74k
                        Py_UNICODE_IS_SURROGATE(ch))
6031
102
                        break;
6032
2.64k
                    PyUnicode_WRITE(kind, data, pos++, ch);
6033
2.64k
                    q += 4;
6034
2.64k
                } while (q <= last);
6035
3.64k
            }
6036
0
            writer.pos = pos;
6037
65.9k
        }
6038
6039
79.3k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
6040
245
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
6041
245
            startinpos = ((const char *)q) - starts;
6042
245
            endinpos = startinpos + 4;
6043
245
        }
6044
79.1k
        else if (ch <= maxch) {
6045
14.4k
            if (q == e || consumed)
6046
2.71k
                break;
6047
            /* remaining bytes at the end? (size should be divisible by 4) */
6048
11.7k
            errmsg = "truncated data";
6049
11.7k
            startinpos = ((const char *)q) - starts;
6050
11.7k
            endinpos = ((const char *)e) - starts;
6051
11.7k
        }
6052
64.6k
        else {
6053
64.6k
            if (ch < 0x110000) {
6054
3.94k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6055
0
                    goto onError;
6056
3.94k
                q += 4;
6057
3.94k
                continue;
6058
3.94k
            }
6059
60.7k
            errmsg = "code point not in range(0x110000)";
6060
60.7k
            startinpos = ((const char *)q) - starts;
6061
60.7k
            endinpos = startinpos + 4;
6062
60.7k
        }
6063
6064
        /* The remaining input chars are ignored if the callback
6065
           chooses to skip the input */
6066
72.6k
        if (unicode_decode_call_errorhandler_writer(
6067
72.6k
                errors, &errorHandler,
6068
72.6k
                encoding, errmsg,
6069
72.6k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
6070
72.6k
                &writer))
6071
18.2k
            goto onError;
6072
72.6k
    }
6073
6074
2.71k
    if (consumed)
6075
0
        *consumed = (const char *)q-starts;
6076
6077
2.71k
    Py_XDECREF(errorHandler);
6078
2.71k
    Py_XDECREF(exc);
6079
2.71k
    return _PyUnicodeWriter_Finish(&writer);
6080
6081
18.2k
  onError:
6082
18.2k
    _PyUnicodeWriter_Dealloc(&writer);
6083
18.2k
    Py_XDECREF(errorHandler);
6084
18.2k
    Py_XDECREF(exc);
6085
18.2k
    return NULL;
6086
20.9k
}
6087
6088
/* Encode the string object `str` as UTF-32 and return a new bytes object,
   or NULL with an exception set.

   errors     error-handler name used for code points the stringlib encoder
              refuses (reported as "surrogates not allowed").
   byteorder  <0: little endian, no BOM; >0: big endian, no BOM;
              0: native byte order with a leading BOM (0xFEFF).

   The codec name handed to the error handler is derived from the sign of
   `byteorder`, the same test that selects the byte order actually written,
   so the two always agree (and match _PyUnicode_EncodeUTF16). */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    uint32_t *out;
#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Four bytes per character plus an optional BOM must fit in
       Py_ssize_t. */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
        return PyErr_NoMemory();
    nsize = len + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
    if (v == NULL)
        return NULL;

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
    out = (uint32_t *)PyBytes_AS_STRING(v);
    if (byteorder == 0)
        *out++ = 0xFEFF;
    if (len == 0)
        goto done;

    /* Codec name for error reporting; keyed on the sign of byteorder so
       it always matches native_ordering above. */
    if (byteorder < 0)
        encoding = "utf-32-le";
    else if (byteorder > 0)
        encoding = "utf-32-be";
    else
        encoding = "utf-32";

    /* One-byte strings are encoded in a single call with no error
       handling loop. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* Encode as far as possible; the helper's return value is added
           to pos, and stopping short of len means an unencodable
           character is at pos. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

        if (PyBytes_Check(rep)) {
            /* A bytes replacement is copied verbatim, so it must be a
               whole number of 4-byte units. */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            /* A str replacement must be ASCII; it is encoded with the
               one-byte encoder below. */
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Units needed beyond those already reserved for the input
           characters the handler skipped. */
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
                goto error;
            /* The buffer may have moved; re-derive the write pointer. */
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    /* NOTE(review): the result of the resize is not checked; per the
       _PyBytes_Resize documentation a failure leaves v == NULL with an
       exception set, which is then what gets returned -- confirm. */
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
6232
6233
/* Encode `unicode` as UTF-32 with the default error handling (errors is
   passed as NULL) and byteorder 0.  Returns whatever
   _PyUnicode_EncodeUTF32() returns. */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    const char *default_errors = NULL;
    const int native_with_bom = 0;

    return _PyUnicode_EncodeUTF32(unicode, default_errors, native_with_bom);
}
6238
6239
/* --- UTF-16 Codec ------------------------------------------------------- */
6240
6241
PyObject *
6242
PyUnicode_DecodeUTF16(const char *s,
6243
                      Py_ssize_t size,
6244
                      const char *errors,
6245
                      int *byteorder)
6246
130
{
6247
130
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6248
130
}
6249
6250
PyObject *
6251
PyUnicode_DecodeUTF16Stateful(const char *s,
6252
                              Py_ssize_t size,
6253
                              const char *errors,
6254
                              int *byteorder,
6255
                              Py_ssize_t *consumed)
6256
19.2k
{
6257
19.2k
    const char *starts = s;
6258
19.2k
    Py_ssize_t startinpos;
6259
19.2k
    Py_ssize_t endinpos;
6260
19.2k
    _PyUnicodeWriter writer;
6261
19.2k
    const unsigned char *q, *e;
6262
19.2k
    int bo = 0;       /* assume native ordering by default */
6263
19.2k
    int native_ordering;
6264
19.2k
    const char *errmsg = "";
6265
19.2k
    PyObject *errorHandler = NULL;
6266
19.2k
    PyObject *exc = NULL;
6267
19.2k
    const char *encoding;
6268
6269
19.2k
    q = (const unsigned char *)s;
6270
19.2k
    e = q + size;
6271
6272
19.2k
    if (byteorder)
6273
19.0k
        bo = *byteorder;
6274
6275
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6276
       byte order setting accordingly. In native mode, the leading BOM
6277
       mark is skipped, in all other modes, it is copied to the output
6278
       stream as-is (giving a ZWNBSP character). */
6279
19.2k
    if (bo == 0 && size >= 2) {
6280
18.5k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6281
18.5k
        if (bom == 0xFEFF) {
6282
380
            q += 2;
6283
380
            bo = -1;
6284
380
        }
6285
18.1k
        else if (bom == 0xFFFE) {
6286
1.71k
            q += 2;
6287
1.71k
            bo = 1;
6288
1.71k
        }
6289
18.5k
        if (byteorder)
6290
18.4k
            *byteorder = bo;
6291
18.5k
    }
6292
6293
19.2k
    if (q == e) {
6294
37
        if (consumed)
6295
0
            *consumed = size;
6296
37
        _Py_RETURN_UNICODE_EMPTY();
6297
37
    }
6298
6299
19.1k
#if PY_LITTLE_ENDIAN
6300
19.1k
    native_ordering = bo <= 0;
6301
19.1k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6302
#else
6303
    native_ordering = bo >= 0;
6304
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6305
#endif
6306
6307
    /* Note: size will always be longer than the resulting Unicode
6308
       character count normally.  Error handler will take care of
6309
       resizing when needed. */
6310
19.1k
    _PyUnicodeWriter_Init(&writer);
6311
19.1k
    writer.min_length = (e - q + 1) / 2;
6312
19.1k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6313
0
        goto onError;
6314
6315
81.6k
    while (1) {
6316
81.6k
        Py_UCS4 ch = 0;
6317
81.6k
        if (e - q >= 2) {
6318
71.9k
            int kind = writer.kind;
6319
71.9k
            if (kind == PyUnicode_1BYTE_KIND) {
6320
21.4k
                if (PyUnicode_IS_ASCII(writer.buffer))
6321
18.6k
                    ch = asciilib_utf16_decode(&q, e,
6322
18.6k
                            (Py_UCS1*)writer.data, &writer.pos,
6323
18.6k
                            native_ordering);
6324
2.79k
                else
6325
2.79k
                    ch = ucs1lib_utf16_decode(&q, e,
6326
2.79k
                            (Py_UCS1*)writer.data, &writer.pos,
6327
2.79k
                            native_ordering);
6328
50.5k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6329
17.6k
                ch = ucs2lib_utf16_decode(&q, e,
6330
17.6k
                        (Py_UCS2*)writer.data, &writer.pos,
6331
17.6k
                        native_ordering);
6332
32.8k
            } else {
6333
32.8k
                assert(kind == PyUnicode_4BYTE_KIND);
6334
32.8k
                ch = ucs4lib_utf16_decode(&q, e,
6335
32.8k
                        (Py_UCS4*)writer.data, &writer.pos,
6336
32.8k
                        native_ordering);
6337
32.8k
            }
6338
71.9k
        }
6339
6340
81.6k
        switch (ch)
6341
81.6k
        {
6342
15.3k
        case 0:
6343
            /* remaining byte at the end? (size should be even) */
6344
15.3k
            if (q == e || consumed)
6345
10.8k
                goto End;
6346
4.54k
            errmsg = "truncated data";
6347
4.54k
            startinpos = ((const char *)q) - starts;
6348
4.54k
            endinpos = ((const char *)e) - starts;
6349
4.54k
            break;
6350
            /* The remaining input chars are ignored if the callback
6351
               chooses to skip the input */
6352
1.92k
        case 1:
6353
1.92k
            q -= 2;
6354
1.92k
            if (consumed)
6355
0
                goto End;
6356
1.92k
            errmsg = "unexpected end of data";
6357
1.92k
            startinpos = ((const char *)q) - starts;
6358
1.92k
            endinpos = ((const char *)e) - starts;
6359
1.92k
            break;
6360
26.8k
        case 2:
6361
26.8k
            errmsg = "illegal encoding";
6362
26.8k
            startinpos = ((const char *)q) - 2 - starts;
6363
26.8k
            endinpos = startinpos + 2;
6364
26.8k
            break;
6365
13.2k
        case 3:
6366
13.2k
            errmsg = "illegal UTF-16 surrogate";
6367
13.2k
            startinpos = ((const char *)q) - 4 - starts;
6368
13.2k
            endinpos = startinpos + 2;
6369
13.2k
            break;
6370
24.2k
        default:
6371
24.2k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6372
0
                goto onError;
6373
24.2k
            continue;
6374
81.6k
        }
6375
6376
46.5k
        if (unicode_decode_call_errorhandler_writer(
6377
46.5k
                errors,
6378
46.5k
                &errorHandler,
6379
46.5k
                encoding, errmsg,
6380
46.5k
                &starts,
6381
46.5k
                (const char **)&e,
6382
46.5k
                &startinpos,
6383
46.5k
                &endinpos,
6384
46.5k
                &exc,
6385
46.5k
                (const char **)&q,
6386
46.5k
                &writer))
6387
8.33k
            goto onError;
6388
46.5k
    }
6389
6390
10.8k
End:
6391
10.8k
    if (consumed)
6392
0
        *consumed = (const char *)q-starts;
6393
6394
10.8k
    Py_XDECREF(errorHandler);
6395
10.8k
    Py_XDECREF(exc);
6396
10.8k
    return _PyUnicodeWriter_Finish(&writer);
6397
6398
8.33k
  onError:
6399
8.33k
    _PyUnicodeWriter_Dealloc(&writer);
6400
8.33k
    Py_XDECREF(errorHandler);
6401
8.33k
    Py_XDECREF(exc);
6402
8.33k
    return NULL;
6403
19.1k
}
6404
6405
/* Encode the string object `str` as UTF-16 and return a new bytes object,
   or NULL with an exception set.

   errors     error-handler name used for code points the stringlib encoder
              refuses (reported as "surrogates not allowed").
   byteorder  <0: little endian, no BOM; >0: big endian, no BOM;
              0: native byte order with a leading BOM (0xFEFF). */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    unsigned short *out;
    Py_ssize_t pairs;
#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Count the characters at or above 0x10000: each one is reserved one
       extra 16-bit unit in the output.  Only the 4-byte kind is
       scanned. */
    pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *in = (const Py_UCS4 *)data;
        const Py_UCS4 *end = in + len;
        while (in < end) {
            if (*in++ >= 0x10000) {
                pairs++;
            }
        }
    }
    /* Two bytes per unit (characters + extra pair units + optional BOM)
       must fit in Py_ssize_t. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
        return PyErr_NoMemory();
    }
    nsize = len + pairs + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
    if (v == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
    out = (unsigned short *)PyBytes_AS_STRING(v);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        goto done;
    }

    /* One-byte strings are encoded in a single call with no error
       handling loop. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    /* Codec name used when reporting encode errors. */
    if (byteorder < 0) {
        encoding = "utf-16-le";
    }
    else if (byteorder > 0) {
        encoding = "utf-16-be";
    }
    else {
        encoding = "utf-16";
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* Encode as far as possible; the helper's return value is added
           to pos, and stopping short of len means an unencodable
           character is at pos. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

        if (PyBytes_Check(rep)) {
            /* A bytes replacement is copied verbatim, so it must be a
               whole number of 2-byte units. */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 2;
        }
        else {
            /* A str replacement must be ASCII; it is encoded with the
               one-byte encoder below. */
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Units needed beyond those already reserved for the input
           characters the handler skipped. */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
                goto error;
            /* The buffer may have moved; re-derive the write pointer. */
            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
    encoding of a string containing isolated surrogates and the 'ignore' handler
    is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    /* NOTE(review): the result of the resize is not checked; per the
       _PyBytes_Resize documentation a failure leaves v == NULL with an
       exception set, which is then what gets returned -- confirm. */
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
6568
6569
/* Encode `unicode` as UTF-16 with the default error handling (errors is
   passed as NULL) and byteorder 0.  Returns whatever
   _PyUnicode_EncodeUTF16() returns. */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    const char *default_errors = NULL;
    const int native_with_bom = 0;

    return _PyUnicode_EncodeUTF16(unicode, default_errors, native_with_bom);
}
6574
6575
_PyUnicode_Name_CAPI *
6576
_PyUnicode_GetNameCAPI(void)
6577
2.54k
{
6578
2.54k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6579
2.54k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6580
6581
2.54k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6582
2.54k
    if (ucnhash_capi == NULL) {
6583
1
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6584
1
                PyUnicodeData_CAPSULE_NAME, 1);
6585
6586
        // It's fine if we overwrite the value here. It's always the same value.
6587
1
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6588
1
    }
6589
2.54k
    return ucnhash_capi;
6590
2.54k
}
6591
6592
/* --- Unicode Escape Codec ----------------------------------------------- */
6593
6594
PyObject *
6595
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6596
                               Py_ssize_t size,
6597
                               const char *errors,
6598
                               Py_ssize_t *consumed,
6599
                               int *first_invalid_escape_char,
6600
                               const char **first_invalid_escape_ptr)
6601
31.1k
{
6602
31.1k
    const char *starts = s;
6603
31.1k
    const char *initial_starts = starts;
6604
31.1k
    _PyUnicodeWriter writer;
6605
31.1k
    const char *end;
6606
31.1k
    PyObject *errorHandler = NULL;
6607
31.1k
    PyObject *exc = NULL;
6608
31.1k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6609
6610
    // so we can remember if we've seen an invalid escape char or not
6611
31.1k
    *first_invalid_escape_char = -1;
6612
31.1k
    *first_invalid_escape_ptr = NULL;
6613
6614
31.1k
    if (size == 0) {
6615
2.30k
        if (consumed) {
6616
0
            *consumed = 0;
6617
0
        }
6618
2.30k
        _Py_RETURN_UNICODE_EMPTY();
6619
2.30k
    }
6620
    /* Escaped strings will always be longer than the resulting
6621
       Unicode string, so we start with size here and then reduce the
6622
       length after conversion to the true value.
6623
       (but if the error callback returns a long replacement string
6624
       we'll have to allocate more space) */
6625
28.8k
    _PyUnicodeWriter_Init(&writer);
6626
28.8k
    writer.min_length = size;
6627
28.8k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6628
0
        goto onError;
6629
0
    }
6630
6631
28.8k
    end = s + size;
6632
213k
    while (s < end) {
6633
184k
        unsigned char c = (unsigned char) *s++;
6634
184k
        Py_UCS4 ch;
6635
184k
        int count;
6636
184k
        const char *message;
6637
6638
184k
#define WRITE_ASCII_CHAR(ch)                                                  \
6639
184k
            do {                                                              \
6640
14.8k
                assert(ch <= 127);                                            \
6641
14.8k
                assert(writer.pos < writer.size);                             \
6642
14.8k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6643
14.8k
            } while(0)
6644
6645
184k
#define WRITE_CHAR(ch)                                                        \
6646
184k
            do {                                                              \
6647
173k
                if (ch <= writer.maxchar) {                                   \
6648
156k
                    assert(writer.pos < writer.size);                         \
6649
156k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6650
156k
                }                                                             \
6651
173k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6652
0
                    goto onError;                                             \
6653
0
                }                                                             \
6654
173k
            } while(0)
6655
6656
        /* Non-escape characters are interpreted as Unicode ordinals */
6657
184k
        if (c != '\\') {
6658
119k
            WRITE_CHAR(c);
6659
119k
            continue;
6660
119k
        }
6661
6662
64.6k
        Py_ssize_t startinpos = s - starts - 1;
6663
        /* \ - Escapes */
6664
64.6k
        if (s >= end) {
6665
0
            message = "\\ at end of string";
6666
0
            goto incomplete;
6667
0
        }
6668
64.6k
        c = (unsigned char) *s++;
6669
6670
64.6k
        assert(writer.pos < writer.size);
6671
64.6k
        switch (c) {
6672
6673
            /* \x escapes */
6674
943
        case '\n': continue;
6675
1.95k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6676
991
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6677
1.08k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6678
727
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6679
        /* FF */
6680
768
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6681
655
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6682
972
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6683
1.79k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6684
        /* VT */
6685
714
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6686
        /* BEL, not classic C */
6687
709
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6688
6689
            /* \OOO (octal) escapes */
6690
3.62k
        case '0': case '1': case '2': case '3':
6691
7.67k
        case '4': case '5': case '6': case '7':
6692
7.67k
            ch = c - '0';
6693
7.67k
            if (s < end && '0' <= *s && *s <= '7') {
6694
3.93k
                ch = (ch<<3) + *s++ - '0';
6695
3.93k
                if (s < end && '0' <= *s && *s <= '7') {
6696
1.89k
                    ch = (ch<<3) + *s++ - '0';
6697
1.89k
                }
6698
3.93k
            }
6699
7.67k
            if (ch > 0377) {
6700
1.54k
                if (*first_invalid_escape_char == -1) {
6701
1.03k
                    *first_invalid_escape_char = ch;
6702
1.03k
                    if (starts == initial_starts) {
6703
                        /* Back up 3 chars, since we've already incremented s. */
6704
1.03k
                        *first_invalid_escape_ptr = s - 3;
6705
1.03k
                    }
6706
1.03k
                }
6707
1.54k
            }
6708
7.67k
            WRITE_CHAR(ch);
6709
7.67k
            continue;
6710
6711
            /* hex escapes */
6712
            /* \xXX */
6713
7.67k
        case 'x':
6714
6.07k
            count = 2;
6715
6.07k
            message = "truncated \\xXX escape";
6716
6.07k
            goto hexescape;
6717
6718
            /* \uXXXX */
6719
9.10k
        case 'u':
6720
9.10k
            count = 4;
6721
9.10k
            message = "truncated \\uXXXX escape";
6722
9.10k
            goto hexescape;
6723
6724
            /* \UXXXXXXXX */
6725
23.4k
        case 'U':
6726
23.4k
            count = 8;
6727
23.4k
            message = "truncated \\UXXXXXXXX escape";
6728
38.6k
        hexescape:
6729
274k
            for (ch = 0; count; ++s, --count) {
6730
236k
                if (s >= end) {
6731
8
                    goto incomplete;
6732
8
                }
6733
236k
                c = (unsigned char)*s;
6734
236k
                ch <<= 4;
6735
236k
                if (c >= '0' && c <= '9') {
6736
170k
                    ch += c - '0';
6737
170k
                }
6738
65.8k
                else if (c >= 'a' && c <= 'f') {
6739
65.6k
                    ch += c - ('a' - 10);
6740
65.6k
                }
6741
227
                else if (c >= 'A' && c <= 'F') {
6742
223
                    ch += c - ('A' - 10);
6743
223
                }
6744
4
                else {
6745
4
                    goto error;
6746
4
                }
6747
236k
            }
6748
6749
            /* when we get here, ch is a 32-bit unicode character */
6750
38.6k
            if (ch > MAX_UNICODE) {
6751
1
                message = "illegal Unicode character";
6752
1
                goto error;
6753
1
            }
6754
6755
38.6k
            WRITE_CHAR(ch);
6756
38.6k
            continue;
6757
6758
            /* \N{name} */
6759
38.6k
        case 'N':
6760
2.54k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6761
2.54k
            if (ucnhash_capi == NULL) {
6762
0
                PyErr_SetString(
6763
0
                        PyExc_UnicodeError,
6764
0
                        "\\N escapes not supported (can't load unicodedata module)"
6765
0
                );
6766
0
                goto onError;
6767
0
            }
6768
6769
2.54k
            message = "malformed \\N character escape";
6770
2.54k
            if (s >= end) {
6771
4
                goto incomplete;
6772
4
            }
6773
2.54k
            if (*s == '{') {
6774
2.53k
                const char *start = ++s;
6775
2.53k
                size_t namelen;
6776
                /* look for the closing brace */
6777
38.5k
                while (s < end && *s != '}')
6778
35.9k
                    s++;
6779
2.53k
                if (s >= end) {
6780
9
                    goto incomplete;
6781
9
                }
6782
2.52k
                namelen = s - start;
6783
2.52k
                if (namelen) {
6784
                    /* found a name.  look it up in the unicode database */
6785
2.52k
                    s++;
6786
2.52k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6787
2.52k
                    if (namelen <= INT_MAX &&
6788
2.52k
                        ucnhash_capi->getcode(start, (int)namelen,
6789
2.52k
                                              &ch, 0)) {
6790
2.43k
                        assert(ch <= MAX_UNICODE);
6791
2.43k
                        WRITE_CHAR(ch);
6792
2.43k
                        continue;
6793
2.43k
                    }
6794
93
                    message = "unknown Unicode character name";
6795
93
                }
6796
2.52k
            }
6797
99
            goto error;
6798
6799
4.45k
        default:
6800
4.45k
            if (*first_invalid_escape_char == -1) {
6801
2.89k
                *first_invalid_escape_char = c;
6802
2.89k
                if (starts == initial_starts) {
6803
                    /* Back up one char, since we've already incremented s. */
6804
2.89k
                    *first_invalid_escape_ptr = s - 1;
6805
2.89k
                }
6806
2.89k
            }
6807
4.45k
            WRITE_ASCII_CHAR('\\');
6808
4.45k
            WRITE_CHAR(c);
6809
4.45k
            continue;
6810
64.6k
        }
6811
6812
21
      incomplete:
6813
21
        if (consumed) {
6814
0
            *consumed = startinpos;
6815
0
            break;
6816
0
        }
6817
125
      error:;
6818
125
        Py_ssize_t endinpos = s-starts;
6819
125
        writer.min_length = end - s + writer.pos;
6820
125
        if (unicode_decode_call_errorhandler_writer(
6821
125
                errors, &errorHandler,
6822
125
                "unicodeescape", message,
6823
125
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6824
125
                &writer)) {
6825
125
            goto onError;
6826
125
        }
6827
0
        assert(end - s <= writer.size - writer.pos);
6828
6829
0
#undef WRITE_ASCII_CHAR
6830
0
#undef WRITE_CHAR
6831
0
    }
6832
6833
28.7k
    Py_XDECREF(errorHandler);
6834
28.7k
    Py_XDECREF(exc);
6835
28.7k
    return _PyUnicodeWriter_Finish(&writer);
6836
6837
125
  onError:
6838
125
    _PyUnicodeWriter_Dealloc(&writer);
6839
125
    Py_XDECREF(errorHandler);
6840
125
    Py_XDECREF(exc);
6841
125
    return NULL;
6842
28.8k
}
6843
6844
PyObject *
6845
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6846
                              Py_ssize_t size,
6847
                              const char *errors,
6848
                              Py_ssize_t *consumed)
6849
0
{
6850
0
    int first_invalid_escape_char;
6851
0
    const char *first_invalid_escape_ptr;
6852
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6853
0
                                                      consumed,
6854
0
                                                      &first_invalid_escape_char,
6855
0
                                                      &first_invalid_escape_ptr);
6856
0
    if (result == NULL)
6857
0
        return NULL;
6858
0
    if (first_invalid_escape_char != -1) {
6859
0
        if (first_invalid_escape_char > 0xff) {
6860
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6861
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6862
0
                                 "Such sequences will not work in the future. ",
6863
0
                                 first_invalid_escape_char) < 0)
6864
0
            {
6865
0
                Py_DECREF(result);
6866
0
                return NULL;
6867
0
            }
6868
0
        }
6869
0
        else {
6870
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6871
0
                                 "\"\\%c\" is an invalid escape sequence. "
6872
0
                                 "Such sequences will not work in the future. ",
6873
0
                                 first_invalid_escape_char) < 0)
6874
0
            {
6875
0
                Py_DECREF(result);
6876
0
                return NULL;
6877
0
            }
6878
0
        }
6879
0
    }
6880
0
    return result;
6881
0
}
6882
6883
PyObject *
6884
PyUnicode_DecodeUnicodeEscape(const char *s,
6885
                              Py_ssize_t size,
6886
                              const char *errors)
6887
0
{
6888
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6889
0
}
6890
6891
/* Return a Unicode-Escape string version of the Unicode object. */
6892
6893
PyObject *
6894
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6895
610k
{
6896
610k
    Py_ssize_t i, len;
6897
610k
    PyObject *repr;
6898
610k
    char *p;
6899
610k
    int kind;
6900
610k
    const void *data;
6901
610k
    Py_ssize_t expandsize;
6902
6903
    /* Initial allocation is based on the longest-possible character
6904
       escape.
6905
6906
       For UCS1 strings it's '\xxx', 4 bytes per source character.
6907
       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6908
       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6909
    */
6910
6911
610k
    if (!PyUnicode_Check(unicode)) {
6912
0
        PyErr_BadArgument();
6913
0
        return NULL;
6914
0
    }
6915
6916
610k
    len = PyUnicode_GET_LENGTH(unicode);
6917
610k
    if (len == 0) {
6918
0
        return PyBytes_FromStringAndSize(NULL, 0);
6919
0
    }
6920
6921
610k
    kind = PyUnicode_KIND(unicode);
6922
610k
    data = PyUnicode_DATA(unicode);
6923
    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6924
       bytes, and 1 byte characters 4. */
6925
610k
    expandsize = kind * 2 + 2;
6926
610k
    if (len > PY_SSIZE_T_MAX / expandsize) {
6927
0
        return PyErr_NoMemory();
6928
0
    }
6929
610k
    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6930
610k
    if (repr == NULL) {
6931
0
        return NULL;
6932
0
    }
6933
6934
610k
    p = PyBytes_AS_STRING(repr);
6935
1.22M
    for (i = 0; i < len; i++) {
6936
610k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6937
6938
        /* U+0000-U+00ff range */
6939
610k
        if (ch < 0x100) {
6940
604k
            if (ch >= ' ' && ch < 127) {
6941
61.6k
                if (ch != '\\') {
6942
                    /* Copy printable US ASCII as-is */
6943
0
                    *p++ = (char) ch;
6944
0
                }
6945
                /* Escape backslashes */
6946
61.6k
                else {
6947
61.6k
                    *p++ = '\\';
6948
61.6k
                    *p++ = '\\';
6949
61.6k
                }
6950
61.6k
            }
6951
6952
            /* Map special whitespace to '\t', \n', '\r' */
6953
543k
            else if (ch == '\t') {
6954
4.87k
                *p++ = '\\';
6955
4.87k
                *p++ = 't';
6956
4.87k
            }
6957
538k
            else if (ch == '\n') {
6958
1.96k
                *p++ = '\\';
6959
1.96k
                *p++ = 'n';
6960
1.96k
            }
6961
536k
            else if (ch == '\r') {
6962
517
                *p++ = '\\';
6963
517
                *p++ = 'r';
6964
517
            }
6965
6966
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6967
535k
            else {
6968
535k
                *p++ = '\\';
6969
535k
                *p++ = 'x';
6970
535k
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6971
535k
                *p++ = Py_hexdigits[ch & 0x000F];
6972
535k
            }
6973
604k
        }
6974
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6975
5.79k
        else if (ch < 0x10000) {
6976
4.73k
            *p++ = '\\';
6977
4.73k
            *p++ = 'u';
6978
4.73k
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6979
4.73k
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6980
4.73k
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6981
4.73k
            *p++ = Py_hexdigits[ch & 0x000F];
6982
4.73k
        }
6983
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6984
1.06k
        else {
6985
6986
            /* Make sure that the first two digits are zero */
6987
1.06k
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6988
1.06k
            *p++ = '\\';
6989
1.06k
            *p++ = 'U';
6990
1.06k
            *p++ = '0';
6991
1.06k
            *p++ = '0';
6992
1.06k
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6993
1.06k
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6994
1.06k
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6995
1.06k
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6996
1.06k
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6997
1.06k
            *p++ = Py_hexdigits[ch & 0x0000000F];
6998
1.06k
        }
6999
610k
    }
7000
7001
610k
    assert(p - PyBytes_AS_STRING(repr) > 0);
7002
610k
    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
7003
0
        return NULL;
7004
0
    }
7005
610k
    return repr;
7006
610k
}
7007
7008
/* --- Raw Unicode Escape Codec ------------------------------------------- */
7009
7010
PyObject *
7011
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
7012
                                          Py_ssize_t size,
7013
                                          const char *errors,
7014
                                          Py_ssize_t *consumed)
7015
0
{
7016
0
    const char *starts = s;
7017
0
    _PyUnicodeWriter writer;
7018
0
    const char *end;
7019
0
    PyObject *errorHandler = NULL;
7020
0
    PyObject *exc = NULL;
7021
7022
0
    if (size == 0) {
7023
0
        if (consumed) {
7024
0
            *consumed = 0;
7025
0
        }
7026
0
        _Py_RETURN_UNICODE_EMPTY();
7027
0
    }
7028
7029
    /* Escaped strings will always be longer than the resulting
7030
       Unicode string, so we start with size here and then reduce the
7031
       length after conversion to the true value. (But decoding error
7032
       handler might have to resize the string) */
7033
0
    _PyUnicodeWriter_Init(&writer);
7034
0
    writer.min_length = size;
7035
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
7036
0
        goto onError;
7037
0
    }
7038
7039
0
    end = s + size;
7040
0
    while (s < end) {
7041
0
        unsigned char c = (unsigned char) *s++;
7042
0
        Py_UCS4 ch;
7043
0
        int count;
7044
0
        const char *message;
7045
7046
0
#define WRITE_CHAR(ch)                                                        \
7047
0
            do {                                                              \
7048
0
                if (ch <= writer.maxchar) {                                   \
7049
0
                    assert(writer.pos < writer.size);                         \
7050
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
7051
0
                }                                                             \
7052
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
7053
0
                    goto onError;                                             \
7054
0
                }                                                             \
7055
0
            } while(0)
7056
7057
        /* Non-escape characters are interpreted as Unicode ordinals */
7058
0
        if (c != '\\' || (s >= end && !consumed)) {
7059
0
            WRITE_CHAR(c);
7060
0
            continue;
7061
0
        }
7062
7063
0
        Py_ssize_t startinpos = s - starts - 1;
7064
        /* \ - Escapes */
7065
0
        if (s >= end) {
7066
0
            assert(consumed);
7067
            // Set message to silent compiler warning.
7068
            // Actually it is never used.
7069
0
            message = "\\ at end of string";
7070
0
            goto incomplete;
7071
0
        }
7072
7073
0
        c = (unsigned char) *s++;
7074
0
        if (c == 'u') {
7075
0
            count = 4;
7076
0
            message = "truncated \\uXXXX escape";
7077
0
        }
7078
0
        else if (c == 'U') {
7079
0
            count = 8;
7080
0
            message = "truncated \\UXXXXXXXX escape";
7081
0
        }
7082
0
        else {
7083
0
            assert(writer.pos < writer.size);
7084
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
7085
0
            WRITE_CHAR(c);
7086
0
            continue;
7087
0
        }
7088
7089
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
7090
0
        for (ch = 0; count; ++s, --count) {
7091
0
            if (s >= end) {
7092
0
                goto incomplete;
7093
0
            }
7094
0
            c = (unsigned char)*s;
7095
0
            ch <<= 4;
7096
0
            if (c >= '0' && c <= '9') {
7097
0
                ch += c - '0';
7098
0
            }
7099
0
            else if (c >= 'a' && c <= 'f') {
7100
0
                ch += c - ('a' - 10);
7101
0
            }
7102
0
            else if (c >= 'A' && c <= 'F') {
7103
0
                ch += c - ('A' - 10);
7104
0
            }
7105
0
            else {
7106
0
                goto error;
7107
0
            }
7108
0
        }
7109
0
        if (ch > MAX_UNICODE) {
7110
0
            message = "\\Uxxxxxxxx out of range";
7111
0
            goto error;
7112
0
        }
7113
0
        WRITE_CHAR(ch);
7114
0
        continue;
7115
7116
0
      incomplete:
7117
0
        if (consumed) {
7118
0
            *consumed = startinpos;
7119
0
            break;
7120
0
        }
7121
0
      error:;
7122
0
        Py_ssize_t endinpos = s-starts;
7123
0
        writer.min_length = end - s + writer.pos;
7124
0
        if (unicode_decode_call_errorhandler_writer(
7125
0
                errors, &errorHandler,
7126
0
                "rawunicodeescape", message,
7127
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
7128
0
                &writer)) {
7129
0
            goto onError;
7130
0
        }
7131
0
        assert(end - s <= writer.size - writer.pos);
7132
7133
0
#undef WRITE_CHAR
7134
0
    }
7135
0
    Py_XDECREF(errorHandler);
7136
0
    Py_XDECREF(exc);
7137
0
    return _PyUnicodeWriter_Finish(&writer);
7138
7139
0
  onError:
7140
0
    _PyUnicodeWriter_Dealloc(&writer);
7141
0
    Py_XDECREF(errorHandler);
7142
0
    Py_XDECREF(exc);
7143
0
    return NULL;
7144
0
}
7145
7146
PyObject *
7147
PyUnicode_DecodeRawUnicodeEscape(const char *s,
7148
                                 Py_ssize_t size,
7149
                                 const char *errors)
7150
0
{
7151
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
7152
0
}
7153
7154
7155
/* Return a Raw-Unicode-Escape encoded bytes object for the Unicode object.

   Strings stored with the 1-byte kind are returned as a plain copy of their
   data.  Otherwise each character is encoded as:
     - below 0x100:        the single byte with that value
     - 0x100 .. 0xffff:    "\uHHHH"
     - 0x10000 and above:  "\U00HHHHHH"

   Sets an error via PyErr_BadArgument() and returns NULL if 'unicode' is not
   a str; returns PyErr_NoMemory() if the worst-case output size would
   overflow Py_ssize_t. */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);

    /* Fast path: 1-byte storage needs no escaping at all. */
    if (kind == PyUnicode_1BYTE_KIND) {
        return PyBytes_FromStringAndSize(data, len);
    }

    /* Worst-case output bytes per source character for the remaining
       kinds: 2-byte -> 6 ("\uHHHH"), 4-byte -> 10 ("\U00HHHHHH"). */
    Py_ssize_t expandsize = kind * 2 + 2;
    if (len > PY_SSIZE_T_MAX / expandsize) {
        return PyErr_NoMemory();
    }

    PyObject *repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
    if (repr == NULL) {
        return NULL;
    }
    if (len == 0) {
        return repr;
    }

    char *p = PyBytes_AS_STRING(repr);
    for (Py_ssize_t pos = 0; pos < len; pos++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* U+010000-U+10ffff range: '\U00HHHHHH' */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *p++ = '\\';
            *p++ = 'U';
            *p++ = '0';
            *p++ = '0';
            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
            *p++ = Py_hexdigits[ch & 0xf];
        }
        /* U+0100-U+ffff range: '\uHHHH' */
        else if (ch >= 0x100) {
            *p++ = '\\';
            *p++ = 'u';
            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
            *p++ = Py_hexdigits[ch & 0xf];
        }
        /* U+0000-U+00ff range: Copy 8-bit characters as-is */
        else {
            *p++ = (char) ch;
        }
    }

    /* Shrink the worst-case buffer to the bytes actually written. */
    assert(p > PyBytes_AS_STRING(repr));
    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
        return NULL;
    }
    return repr;
}
7230
7231
/* --- Latin-1 Codec ------------------------------------------------------ */
7232
7233
PyObject *
7234
PyUnicode_DecodeLatin1(const char *s,
7235
                       Py_ssize_t size,
7236
                       const char *errors)
7237
2.55M
{
7238
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7239
2.55M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7240
2.55M
}
7241
7242
/* create or adjust a UnicodeEncodeError */
7243
static void
7244
make_encode_exception(PyObject **exceptionObject,
7245
                      const char *encoding,
7246
                      PyObject *unicode,
7247
                      Py_ssize_t startpos, Py_ssize_t endpos,
7248
                      const char *reason)
7249
194k
{
7250
194k
    if (*exceptionObject == NULL) {
7251
194k
        *exceptionObject = PyObject_CallFunction(
7252
194k
            PyExc_UnicodeEncodeError, "sOnns",
7253
194k
            encoding, unicode, startpos, endpos, reason);
7254
194k
    }
7255
0
    else {
7256
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7257
0
            goto onError;
7258
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7259
0
            goto onError;
7260
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7261
0
            goto onError;
7262
0
        return;
7263
0
      onError:
7264
0
        Py_CLEAR(*exceptionObject);
7265
0
    }
7266
194k
}
7267
7268
/* raises a UnicodeEncodeError */
7269
static void
7270
raise_encode_exception(PyObject **exceptionObject,
7271
                       const char *encoding,
7272
                       PyObject *unicode,
7273
                       Py_ssize_t startpos, Py_ssize_t endpos,
7274
                       const char *reason)
7275
36.6k
{
7276
36.6k
    make_encode_exception(exceptionObject,
7277
36.6k
                          encoding, unicode, startpos, endpos, reason);
7278
36.6k
    if (*exceptionObject != NULL)
7279
36.6k
        PyCodec_StrictErrors(*exceptionObject);
7280
36.6k
}
7281
7282
/* error handling callback helper:
7283
   build arguments, call the callback and check the arguments,
7284
   put the result into newpos and return the replacement string, which
7285
   has to be freed by the caller */
7286
static PyObject *
7287
unicode_encode_call_errorhandler(const char *errors,
7288
                                 PyObject **errorHandler,
7289
                                 const char *encoding, const char *reason,
7290
                                 PyObject *unicode, PyObject **exceptionObject,
7291
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7292
                                 Py_ssize_t *newpos)
7293
158k
{
7294
158k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7295
158k
    Py_ssize_t len;
7296
158k
    PyObject *restuple;
7297
158k
    PyObject *resunicode;
7298
7299
158k
    if (*errorHandler == NULL) {
7300
158k
        *errorHandler = PyCodec_LookupError(errors);
7301
158k
        if (*errorHandler == NULL)
7302
0
            return NULL;
7303
158k
    }
7304
7305
158k
    len = PyUnicode_GET_LENGTH(unicode);
7306
7307
158k
    make_encode_exception(exceptionObject,
7308
158k
                          encoding, unicode, startpos, endpos, reason);
7309
158k
    if (*exceptionObject == NULL)
7310
0
        return NULL;
7311
7312
158k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7313
158k
    if (restuple == NULL)
7314
158k
        return NULL;
7315
0
    if (!PyTuple_Check(restuple)) {
7316
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7317
0
        Py_DECREF(restuple);
7318
0
        return NULL;
7319
0
    }
7320
0
    if (!PyArg_ParseTuple(restuple, argparse,
7321
0
                          &resunicode, newpos)) {
7322
0
        Py_DECREF(restuple);
7323
0
        return NULL;
7324
0
    }
7325
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7326
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7327
0
        Py_DECREF(restuple);
7328
0
        return NULL;
7329
0
    }
7330
0
    if (*newpos<0)
7331
0
        *newpos = len + *newpos;
7332
0
    if (*newpos<0 || *newpos>len) {
7333
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7334
0
        Py_DECREF(restuple);
7335
0
        return NULL;
7336
0
    }
7337
0
    Py_INCREF(resunicode);
7338
0
    Py_DECREF(restuple);
7339
0
    return resunicode;
7340
0
}
7341
7342
static PyObject *
7343
unicode_encode_ucs1(PyObject *unicode,
7344
                    const char *errors,
7345
                    const Py_UCS4 limit)
7346
46.2k
{
7347
    /* input state */
7348
46.2k
    Py_ssize_t pos=0, size;
7349
46.2k
    int kind;
7350
46.2k
    const void *data;
7351
    /* pointer into the output */
7352
46.2k
    char *str;
7353
46.2k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7354
46.2k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7355
46.2k
    PyObject *error_handler_obj = NULL;
7356
46.2k
    PyObject *exc = NULL;
7357
46.2k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7358
46.2k
    PyObject *rep = NULL;
7359
    /* output object */
7360
46.2k
    _PyBytesWriter writer;
7361
7362
46.2k
    size = PyUnicode_GET_LENGTH(unicode);
7363
46.2k
    kind = PyUnicode_KIND(unicode);
7364
46.2k
    data = PyUnicode_DATA(unicode);
7365
    /* allocate enough for a simple encoding without
7366
       replacements, if we need more, we'll resize */
7367
46.2k
    if (size == 0)
7368
0
        return PyBytes_FromStringAndSize(NULL, 0);
7369
7370
46.2k
    _PyBytesWriter_Init(&writer);
7371
46.2k
    str = _PyBytesWriter_Alloc(&writer, size);
7372
46.2k
    if (str == NULL)
7373
0
        return NULL;
7374
7375
2.56M
    while (pos < size) {
7376
2.56M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7377
7378
        /* can we encode this? */
7379
2.56M
        if (ch < limit) {
7380
            /* no overflow check, because we know that the space is enough */
7381
2.51M
            *str++ = (char)ch;
7382
2.51M
            ++pos;
7383
2.51M
        }
7384
46.2k
        else {
7385
46.2k
            Py_ssize_t newpos, i;
7386
            /* startpos for collecting unencodable chars */
7387
46.2k
            Py_ssize_t collstart = pos;
7388
46.2k
            Py_ssize_t collend = collstart + 1;
7389
            /* find all unecodable characters */
7390
7391
411k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7392
365k
                ++collend;
7393
7394
            /* Only overallocate the buffer if it's not the last write */
7395
46.2k
            writer.overallocate = (collend < size);
7396
7397
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7398
46.2k
            if (error_handler == _Py_ERROR_UNKNOWN)
7399
46.2k
                error_handler = _Py_GetErrorHandler(errors);
7400
7401
46.2k
            switch (error_handler) {
7402
36.6k
            case _Py_ERROR_STRICT:
7403
36.6k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7404
36.6k
                goto onError;
7405
7406
0
            case _Py_ERROR_REPLACE:
7407
0
                memset(str, '?', collend - collstart);
7408
0
                str += (collend - collstart);
7409
0
                _Py_FALLTHROUGH;
7410
0
            case _Py_ERROR_IGNORE:
7411
0
                pos = collend;
7412
0
                break;
7413
7414
0
            case _Py_ERROR_BACKSLASHREPLACE:
7415
                /* subtract preallocated bytes */
7416
0
                writer.min_size -= (collend - collstart);
7417
0
                str = backslashreplace(&writer, str,
7418
0
                                       unicode, collstart, collend);
7419
0
                if (str == NULL)
7420
0
                    goto onError;
7421
0
                pos = collend;
7422
0
                break;
7423
7424
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7425
                /* subtract preallocated bytes */
7426
0
                writer.min_size -= (collend - collstart);
7427
0
                str = xmlcharrefreplace(&writer, str,
7428
0
                                        unicode, collstart, collend);
7429
0
                if (str == NULL)
7430
0
                    goto onError;
7431
0
                pos = collend;
7432
0
                break;
7433
7434
9.55k
            case _Py_ERROR_SURROGATEESCAPE:
7435
9.55k
                for (i = collstart; i < collend; ++i) {
7436
9.55k
                    ch = PyUnicode_READ(kind, data, i);
7437
9.55k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7438
                        /* Not a UTF-8b surrogate */
7439
9.55k
                        break;
7440
9.55k
                    }
7441
0
                    *str++ = (char)(ch - 0xdc00);
7442
0
                    ++pos;
7443
0
                }
7444
9.55k
                if (i >= collend)
7445
0
                    break;
7446
9.55k
                collstart = pos;
7447
9.55k
                assert(collstart != collend);
7448
9.55k
                _Py_FALLTHROUGH;
7449
7450
9.55k
            default:
7451
9.55k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7452
9.55k
                                                       encoding, reason, unicode, &exc,
7453
9.55k
                                                       collstart, collend, &newpos);
7454
9.55k
                if (rep == NULL)
7455
9.55k
                    goto onError;
7456
7457
0
                if (newpos < collstart) {
7458
0
                    writer.overallocate = 1;
7459
0
                    str = _PyBytesWriter_Prepare(&writer, str,
7460
0
                                                 collstart - newpos);
7461
0
                    if (str == NULL)
7462
0
                        goto onError;
7463
0
                }
7464
0
                else {
7465
                    /* subtract preallocated bytes */
7466
0
                    writer.min_size -= newpos - collstart;
7467
                    /* Only overallocate the buffer if it's not the last write */
7468
0
                    writer.overallocate = (newpos < size);
7469
0
                }
7470
7471
0
                if (PyBytes_Check(rep)) {
7472
                    /* Directly copy bytes result to output. */
7473
0
                    str = _PyBytesWriter_WriteBytes(&writer, str,
7474
0
                                                    PyBytes_AS_STRING(rep),
7475
0
                                                    PyBytes_GET_SIZE(rep));
7476
0
                }
7477
0
                else {
7478
0
                    assert(PyUnicode_Check(rep));
7479
7480
0
                    if (limit == 256 ?
7481
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7482
0
                        !PyUnicode_IS_ASCII(rep))
7483
0
                    {
7484
                        /* Not all characters are smaller than limit */
7485
0
                        raise_encode_exception(&exc, encoding, unicode,
7486
0
                                               collstart, collend, reason);
7487
0
                        goto onError;
7488
0
                    }
7489
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7490
0
                    str = _PyBytesWriter_WriteBytes(&writer, str,
7491
0
                                                    PyUnicode_DATA(rep),
7492
0
                                                    PyUnicode_GET_LENGTH(rep));
7493
0
                }
7494
0
                if (str == NULL)
7495
0
                    goto onError;
7496
7497
0
                pos = newpos;
7498
0
                Py_CLEAR(rep);
7499
46.2k
            }
7500
7501
            /* If overallocation was disabled, ensure that it was the last
7502
               write. Otherwise, we missed an optimization */
7503
0
            assert(writer.overallocate || pos == size);
7504
0
        }
7505
2.56M
    }
7506
7507
0
    Py_XDECREF(error_handler_obj);
7508
0
    Py_XDECREF(exc);
7509
0
    return _PyBytesWriter_Finish(&writer, str);
7510
7511
46.2k
  onError:
7512
46.2k
    Py_XDECREF(rep);
7513
46.2k
    _PyBytesWriter_Dealloc(&writer);
7514
46.2k
    Py_XDECREF(error_handler_obj);
7515
46.2k
    Py_XDECREF(exc);
7516
46.2k
    return NULL;
7517
46.2k
}
7518
7519
/* Encode 'unicode' as Latin-1 bytes using the error handler named by
   'errors'.  Sets an error via PyErr_BadArgument() and returns NULL if
   'unicode' is not a str. */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Non-Latin-1 characters present. Defer to the generic encoder,
           which applies the error handler. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string can be copied into a bytes object
       directly. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7535
7536
/* Encode 'unicode' as Latin-1 bytes, passing NULL as the error handler
   name to _PyUnicode_AsLatin1String(). */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsLatin1String(unicode, default_errors);
}
7541
7542
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7543
7544
PyObject *
7545
PyUnicode_DecodeASCII(const char *s,
7546
                      Py_ssize_t size,
7547
                      const char *errors)
7548
850k
{
7549
850k
    const char *starts = s;
7550
850k
    const char *e = s + size;
7551
850k
    PyObject *error_handler_obj = NULL;
7552
850k
    PyObject *exc = NULL;
7553
850k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7554
7555
850k
    if (size == 0)
7556
0
        _Py_RETURN_UNICODE_EMPTY();
7557
7558
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7559
850k
    if (size == 1 && (unsigned char)s[0] < 128) {
7560
6.03k
        return get_latin1_char((unsigned char)s[0]);
7561
6.03k
    }
7562
7563
    // Shortcut for simple case
7564
844k
    PyObject *u = PyUnicode_New(size, 127);
7565
844k
    if (u == NULL) {
7566
0
        return NULL;
7567
0
    }
7568
844k
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7569
844k
    if (outpos == size) {
7570
699k
        return u;
7571
699k
    }
7572
7573
145k
    _PyUnicodeWriter writer;
7574
145k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7575
145k
    writer.pos = outpos;
7576
7577
145k
    s += outpos;
7578
145k
    int kind = writer.kind;
7579
145k
    void *data = writer.data;
7580
145k
    Py_ssize_t startinpos, endinpos;
7581
7582
19.8M
    while (s < e) {
7583
19.7M
        unsigned char c = (unsigned char)*s;
7584
19.7M
        if (c < 128) {
7585
8.01M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7586
8.01M
            writer.pos++;
7587
8.01M
            ++s;
7588
8.01M
            continue;
7589
8.01M
        }
7590
7591
        /* byte outsize range 0x00..0x7f: call the error handler */
7592
7593
11.7M
        if (error_handler == _Py_ERROR_UNKNOWN)
7594
145k
            error_handler = _Py_GetErrorHandler(errors);
7595
7596
11.7M
        switch (error_handler)
7597
11.7M
        {
7598
650k
        case _Py_ERROR_REPLACE:
7599
11.7M
        case _Py_ERROR_SURROGATEESCAPE:
7600
            /* Fast-path: the error handler only writes one character,
7601
               but we may switch to UCS2 at the first write */
7602
11.7M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7603
0
                goto onError;
7604
11.7M
            kind = writer.kind;
7605
11.7M
            data = writer.data;
7606
7607
11.7M
            if (error_handler == _Py_ERROR_REPLACE)
7608
650k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7609
11.0M
            else
7610
11.0M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7611
11.7M
            writer.pos++;
7612
11.7M
            ++s;
7613
11.7M
            break;
7614
7615
0
        case _Py_ERROR_IGNORE:
7616
0
            ++s;
7617
0
            break;
7618
7619
5.55k
        default:
7620
5.55k
            startinpos = s-starts;
7621
5.55k
            endinpos = startinpos + 1;
7622
5.55k
            if (unicode_decode_call_errorhandler_writer(
7623
5.55k
                    errors, &error_handler_obj,
7624
5.55k
                    "ascii", "ordinal not in range(128)",
7625
5.55k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7626
5.55k
                    &writer))
7627
5.55k
                goto onError;
7628
0
            kind = writer.kind;
7629
0
            data = writer.data;
7630
11.7M
        }
7631
11.7M
    }
7632
139k
    Py_XDECREF(error_handler_obj);
7633
139k
    Py_XDECREF(exc);
7634
139k
    return _PyUnicodeWriter_Finish(&writer);
7635
7636
5.55k
  onError:
7637
5.55k
    _PyUnicodeWriter_Dealloc(&writer);
7638
5.55k
    Py_XDECREF(error_handler_obj);
7639
5.55k
    Py_XDECREF(exc);
7640
5.55k
    return NULL;
7641
145k
}
7642
7643
/* Encode 'unicode' as ASCII bytes using the error handler named by
   'errors'.  Sets an error via PyErr_BadArgument() and returns NULL if
   'unicode' is not a str. */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters present: defer to the generic encoder,
           which applies the error handler. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string can be copied into a bytes object
       directly. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7657
7658
/* Encode 'unicode' as ASCII bytes, passing NULL as the error handler name
   to _PyUnicode_AsASCIIString(). */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsASCIIString(unicode, default_errors);
}
7663
7664
#ifdef MS_WINDOWS
7665
7666
/* --- MBCS codecs for Windows -------------------------------------------- */
7667
7668
/* NEED_RETRY is defined when size_t is wider than int.  The code page
   helpers below take their lengths as int, so the stateful decoder and
   the encoder use this macro to split long inputs into chunks of at
   most DECODING_CHUNK_SIZE. */
#if SIZEOF_INT < SIZEOF_SIZE_T
7669
#define NEED_RETRY
7670
#endif
7671
7672
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
7676
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7677
7678
/* Fallback value for build environments whose headers do not define
   WC_ERR_INVALID_CHARS. */
#ifndef WC_ERR_INVALID_CHARS
7679
#  define WC_ERR_INVALID_CHARS 0x0080
7680
#endif
7681
7682
static const char*
7683
code_page_name(UINT code_page, PyObject **obj)
7684
{
7685
    *obj = NULL;
7686
    if (code_page == CP_ACP)
7687
        return "mbcs";
7688
    if (code_page == CP_UTF7)
7689
        return "CP_UTF7";
7690
    if (code_page == CP_UTF8)
7691
        return "CP_UTF8";
7692
7693
    *obj = PyBytes_FromFormat("cp%u", code_page);
7694
    if (*obj == NULL)
7695
        return NULL;
7696
    return PyBytes_AS_STRING(*obj);
7697
}
7698
7699
static DWORD
7700
decode_code_page_flags(UINT code_page)
7701
{
7702
    if (code_page == CP_UTF7) {
7703
        /* The CP_UTF7 decoder only supports flags=0 */
7704
        return 0;
7705
    }
7706
    else
7707
        return MB_ERR_INVALID_CHARS;
7708
}
7709
7710
/*
7711
 * Decode a byte string from a Windows code page into unicode object in strict
7712
 * mode.
7713
 *
7714
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7715
 * OSError and returns -1 on other error.
7716
 */
7717
static int
7718
decode_code_page_strict(UINT code_page,
7719
                        wchar_t **buf,
7720
                        Py_ssize_t *bufsize,
7721
                        const char *in,
7722
                        int insize)
7723
{
7724
    DWORD flags = MB_ERR_INVALID_CHARS;
7725
    wchar_t *out;
7726
    DWORD outsize;
7727
7728
    /* First get the size of the result */
7729
    assert(insize > 0);
7730
    while ((outsize = MultiByteToWideChar(code_page, flags,
7731
                                          in, insize, NULL, 0)) <= 0)
7732
    {
7733
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7734
            goto error;
7735
        }
7736
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7737
        flags = 0;
7738
    }
7739
7740
    /* Extend a wchar_t* buffer */
7741
    Py_ssize_t n = *bufsize;   /* Get the current length */
7742
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7743
        return -1;
7744
    }
7745
    out = *buf + n;
7746
7747
    /* Do the conversion */
7748
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7749
    if (outsize <= 0)
7750
        goto error;
7751
    return insize;
7752
7753
error:
7754
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7755
        return -2;
7756
    PyErr_SetFromWindowsErr(0);
7757
    return -1;
7758
}
7759
7760
/*
 * Decode a byte string from a code page into unicode object with an error
 * handler.
 *
 * The `size` bytes at `in` are decoded one character at a time and appended
 * to the wide character buffer *buf (current length *bufsize); undecodable
 * bytes are passed to the error handler named by `errors`.  If `final` is
 * zero, an undecodable sequence that reaches the end of the input is left
 * unconsumed instead of being reported.
 *
 * Returns consumed size if succeed, or raise an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
7767
static int
7768
decode_code_page_errors(UINT code_page,
7769
                        wchar_t **buf,
7770
                        Py_ssize_t *bufsize,
7771
                        const char *in, const int size,
7772
                        const char *errors, int final)
7773
{
7774
    const char *startin = in;
7775
    const char *endin = in + size;
7776
    DWORD flags = MB_ERR_INVALID_CHARS;
7777
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7778
       2000 English version of the message. */
7779
    const char *reason = "No mapping for the Unicode character exists "
7780
                         "in the target code page.";
7781
    /* each step cannot decode more than 1 character, but a character can be
7782
       represented as a surrogate pair */
7783
    wchar_t buffer[2], *out;
7784
    int insize;
7785
    Py_ssize_t outsize;
7786
    PyObject *errorHandler = NULL;
7787
    PyObject *exc = NULL;
7788
    PyObject *encoding_obj = NULL;
7789
    const char *encoding;
7790
    DWORD err;
7791
    int ret = -1;
7792
7793
    assert(size > 0);
7794
7795
    /* encoding_obj, when set, owns the storage behind `encoding`; it is
       released at the `error` label. */
    encoding = code_page_name(code_page, &encoding_obj);
7796
    if (encoding == NULL)
7797
        return -1;
7798
7799
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7800
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7801
           UnicodeDecodeError. */
7802
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7803
        if (exc != NULL) {
7804
            PyCodec_StrictErrors(exc);
7805
            Py_CLEAR(exc);
7806
        }
7807
        goto error;
7808
    }
7809
7810
    /* Extend a wchar_t* buffer */
7811
    Py_ssize_t n = *bufsize;   /* Get the current length */
    /* Room for Py_ARRAY_LENGTH(buffer) wide characters per input byte is
       reserved up front; refuse sizes for which n + size * 2 would exceed
       PY_SSIZE_T_MAX. */
7812
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7813
        PyErr_NoMemory();
7814
        goto error;
7815
    }
7816
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7817
        goto error;
7818
    }
7819
    out = *buf + n;
7820
7821
    /* Decode the byte string character per character */
7822
    while (in < endin)
7823
    {
7824
        /* Decode a character */
        /* Try a 1-byte prefix first and lengthen it by one byte after each
           failed attempt, until the conversion succeeds or the prefix
           exceeds 4 bytes or the end of the input. */
7825
        insize = 1;
7826
        do
7827
        {
7828
            outsize = MultiByteToWideChar(code_page, flags,
7829
                                          in, insize,
7830
                                          buffer, Py_ARRAY_LENGTH(buffer));
7831
            if (outsize > 0)
7832
                break;
7833
            err = GetLastError();
7834
            if (err == ERROR_INVALID_FLAGS && flags) {
7835
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7836
                flags = 0;
                /* retry the same prefix length with the new flags */
7837
                continue;
7838
            }
7839
            if (err != ERROR_NO_UNICODE_TRANSLATION
7840
                && err != ERROR_INSUFFICIENT_BUFFER)
7841
            {
7842
                PyErr_SetFromWindowsErr(err);
7843
                goto error;
7844
            }
7845
            insize++;
7846
        }
7847
        /* 4=maximum length of a UTF-8 sequence */
7848
        while (insize <= 4 && (in + insize) <= endin);
7849
7850
        if (outsize <= 0) {
7851
            Py_ssize_t startinpos, endinpos, outpos;
7852
7853
            /* last character in partial decode? */
            /* Not final and the failing sequence reaches the end of the
               input: stop here and report only the bytes consumed so far. */
7854
            if (in + insize >= endin && !final)
7855
                break;
7856
7857
            /* Report exactly one byte (in[0]) as undecodable. */
            startinpos = in - startin;
7858
            endinpos = startinpos + 1;
7859
            outpos = out - *buf;
            /* The handler receives pointers to the input bounds, the input
               cursor and the output buffer; `out` is recomputed from
               *buf and outpos once it returns. */
7860
            if (unicode_decode_call_errorhandler_wchar(
7861
                    errors, &errorHandler,
7862
                    encoding, reason,
7863
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7864
                    buf, bufsize, &outpos))
7865
            {
7866
                goto error;
7867
            }
7868
            out = *buf + outpos;
7869
        }
7870
        else {
            /* Success: consume the prefix and append the 1 or 2 wide
               characters it produced. */
7871
            in += insize;
7872
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7873
            out += outsize;
7874
        }
7875
    }
7876
7877
    /* Shrink the buffer */
7878
    assert(out - *buf <= *bufsize);
7879
    *bufsize = out - *buf;
7880
    /* (in - startin) <= size and size is an int */
7881
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7882
7883
/* Common exit: also reached on success, with ret already set. */
error:
7884
    Py_XDECREF(encoding_obj);
7885
    Py_XDECREF(errorHandler);
7886
    Py_XDECREF(exc);
7887
    return ret;
7888
}
7889
7890
static PyObject *
7891
decode_code_page_stateful(int code_page,
7892
                          const char *s, Py_ssize_t size,
7893
                          const char *errors, Py_ssize_t *consumed)
7894
{
7895
    wchar_t *buf = NULL;
7896
    Py_ssize_t bufsize = 0;
7897
    int chunk_size, final, converted, done;
7898
7899
    if (code_page < 0) {
7900
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7901
        return NULL;
7902
    }
7903
    if (size < 0) {
7904
        PyErr_BadInternalCall();
7905
        return NULL;
7906
    }
7907
7908
    if (consumed)
7909
        *consumed = 0;
7910
7911
    do
7912
    {
7913
#ifdef NEED_RETRY
7914
        if (size > DECODING_CHUNK_SIZE) {
7915
            chunk_size = DECODING_CHUNK_SIZE;
7916
            final = 0;
7917
            done = 0;
7918
        }
7919
        else
7920
#endif
7921
        {
7922
            chunk_size = (int)size;
7923
            final = (consumed == NULL);
7924
            done = 1;
7925
        }
7926
7927
        if (chunk_size == 0 && done) {
7928
            if (buf != NULL)
7929
                break;
7930
            _Py_RETURN_UNICODE_EMPTY();
7931
        }
7932
7933
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7934
                                            s, chunk_size);
7935
        if (converted == -2)
7936
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7937
                                                s, chunk_size,
7938
                                                errors, final);
7939
        assert(converted != 0 || done);
7940
7941
        if (converted < 0) {
7942
            PyMem_Free(buf);
7943
            return NULL;
7944
        }
7945
7946
        if (consumed)
7947
            *consumed += converted;
7948
7949
        s += converted;
7950
        size -= converted;
7951
    } while (!done);
7952
7953
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7954
    PyMem_Free(buf);
7955
    return v;
7956
}
7957
7958
PyObject *
7959
PyUnicode_DecodeCodePageStateful(int code_page,
7960
                                 const char *s,
7961
                                 Py_ssize_t size,
7962
                                 const char *errors,
7963
                                 Py_ssize_t *consumed)
7964
{
7965
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7966
}
7967
7968
PyObject *
7969
PyUnicode_DecodeMBCSStateful(const char *s,
7970
                             Py_ssize_t size,
7971
                             const char *errors,
7972
                             Py_ssize_t *consumed)
7973
{
7974
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7975
}
7976
7977
PyObject *
7978
PyUnicode_DecodeMBCS(const char *s,
7979
                     Py_ssize_t size,
7980
                     const char *errors)
7981
{
7982
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7983
}
7984
7985
static DWORD
7986
encode_code_page_flags(UINT code_page, const char *errors)
7987
{
7988
    if (code_page == CP_UTF8) {
7989
        return WC_ERR_INVALID_CHARS;
7990
    }
7991
    else if (code_page == CP_UTF7) {
7992
        /* CP_UTF7 only supports flags=0 */
7993
        return 0;
7994
    }
7995
    else {
7996
        if (errors != NULL && strcmp(errors, "replace") == 0)
7997
            return 0;
7998
        else
7999
            return WC_NO_BEST_FIT_CHARS;
8000
    }
8001
}
8002
8003
/*
8004
 * Encode a Unicode string to a Windows code page into a byte string in strict
8005
 * mode.
8006
 *
8007
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
8008
 * an OSError and returns -1 on other error.
8009
 */
8010
static int
8011
encode_code_page_strict(UINT code_page, PyObject **outbytes,
8012
                        PyObject *unicode, Py_ssize_t offset, int len,
8013
                        const char* errors)
8014
{
8015
    BOOL usedDefaultChar = FALSE;
8016
    BOOL *pusedDefaultChar = &usedDefaultChar;
8017
    int outsize;
8018
    wchar_t *p;
8019
    Py_ssize_t size;
8020
    const DWORD flags = encode_code_page_flags(code_page, NULL);
8021
    char *out;
8022
    /* Create a substring so that we can get the UTF-16 representation
8023
       of just the slice under consideration. */
8024
    PyObject *substring;
8025
    int ret = -1;
8026
8027
    assert(len > 0);
8028
8029
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
8030
        pusedDefaultChar = &usedDefaultChar;
8031
    else
8032
        pusedDefaultChar = NULL;
8033
8034
    substring = PyUnicode_Substring(unicode, offset, offset+len);
8035
    if (substring == NULL)
8036
        return -1;
8037
    p = PyUnicode_AsWideCharString(substring, &size);
8038
    Py_CLEAR(substring);
8039
    if (p == NULL) {
8040
        return -1;
8041
    }
8042
    assert(size <= INT_MAX);
8043
8044
    /* First get the size of the result */
8045
    outsize = WideCharToMultiByte(code_page, flags,
8046
                                  p, (int)size,
8047
                                  NULL, 0,
8048
                                  NULL, pusedDefaultChar);
8049
    if (outsize <= 0)
8050
        goto error;
8051
    /* If we used a default char, then we failed! */
8052
    if (pusedDefaultChar && *pusedDefaultChar) {
8053
        ret = -2;
8054
        goto done;
8055
    }
8056
8057
    if (*outbytes == NULL) {
8058
        /* Create string object */
8059
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
8060
        if (*outbytes == NULL) {
8061
            goto done;
8062
        }
8063
        out = PyBytes_AS_STRING(*outbytes);
8064
    }
8065
    else {
8066
        /* Extend string object */
8067
        const Py_ssize_t n = PyBytes_Size(*outbytes);
8068
        if (outsize > PY_SSIZE_T_MAX - n) {
8069
            PyErr_NoMemory();
8070
            goto done;
8071
        }
8072
        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
8073
            goto done;
8074
        }
8075
        out = PyBytes_AS_STRING(*outbytes) + n;
8076
    }
8077
8078
    /* Do the conversion */
8079
    outsize = WideCharToMultiByte(code_page, flags,
8080
                                  p, (int)size,
8081
                                  out, outsize,
8082
                                  NULL, pusedDefaultChar);
8083
    if (outsize <= 0)
8084
        goto error;
8085
    if (pusedDefaultChar && *pusedDefaultChar) {
8086
        ret = -2;
8087
        goto done;
8088
    }
8089
    ret = 0;
8090
8091
done:
8092
    PyMem_Free(p);
8093
    return ret;
8094
8095
error:
8096
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
8097
        ret = -2;
8098
        goto done;
8099
    }
8100
    PyErr_SetFromWindowsErr(0);
8101
    goto done;
8102
}
8103
8104
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * The `insize` characters of `unicode` starting at `unicode_offset` are
 * encoded one at a time and appended to *outbytes (created if NULL);
 * characters that cannot be encoded are passed to the error handler named
 * by `errors`.
 *
 * Returns 0 on success, or raises an exception (OSError, MemoryError,
 * UnicodeEncodeError, or whatever the error handler raises) and returns -1.
 */
8110
static int
8111
encode_code_page_errors(UINT code_page, PyObject **outbytes,
8112
                        PyObject *unicode, Py_ssize_t unicode_offset,
8113
                        Py_ssize_t insize, const char* errors)
8114
{
8115
    const DWORD flags = encode_code_page_flags(code_page, errors);
8116
    Py_ssize_t pos = unicode_offset;
8117
    Py_ssize_t endin = unicode_offset + insize;
8118
    /* Ideally, we should get reason from FormatMessage. This is the Windows
8119
       2000 English version of the message. */
8120
    const char *reason = "invalid character";
8121
    /* 4=maximum length of a UTF-8 sequence */
8122
    char buffer[4];
8123
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
8124
    Py_ssize_t outsize;
8125
    char *out;
8126
    PyObject *errorHandler = NULL;
8127
    PyObject *exc = NULL;
8128
    PyObject *encoding_obj = NULL;
8129
    const char *encoding;
8130
    Py_ssize_t newpos, newoutsize;
8131
    PyObject *rep;
8132
    int ret = -1;
8133
8134
    assert(insize > 0);
8135
8136
    /* encoding_obj, when set, owns the storage behind `encoding`. */
    encoding = code_page_name(code_page, &encoding_obj);
8137
    if (encoding == NULL)
8138
        return -1;
8139
8140
    if (errors == NULL || strcmp(errors, "strict") == 0) {
8141
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
8142
           then we raise a UnicodeEncodeError. */
8143
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
8144
        if (exc != NULL) {
8145
            PyCodec_StrictErrors(exc);
8146
            Py_DECREF(exc);
8147
        }
8148
        Py_XDECREF(encoding_obj);
8149
        return -1;
8150
    }
8151
8152
    /* No "default char used" flag is requested for UTF-8 and UTF-7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
8153
        pusedDefaultChar = &usedDefaultChar;
8154
    else
8155
        pusedDefaultChar = NULL;
8156
8157
    /* Reserve sizeof(buffer) bytes per input character, after checking
       that the multiplication cannot overflow Py_ssize_t. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
8158
        PyErr_NoMemory();
8159
        goto error;
8160
    }
8161
    outsize = insize * Py_ARRAY_LENGTH(buffer);
8162
8163
    if (*outbytes == NULL) {
8164
        /* Create string object */
8165
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
8166
        if (*outbytes == NULL)
8167
            goto error;
8168
        out = PyBytes_AS_STRING(*outbytes);
8169
    }
8170
    else {
8171
        /* Extend string object */
8172
        Py_ssize_t n = PyBytes_Size(*outbytes);
8173
        if (n > PY_SSIZE_T_MAX - outsize) {
8174
            PyErr_NoMemory();
8175
            goto error;
8176
        }
8177
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
8178
            goto error;
8179
        out = PyBytes_AS_STRING(*outbytes) + n;
8180
    }
8181
8182
    /* Encode the string character per character */
8183
    while (pos < endin)
8184
    {
8185
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8186
        wchar_t chars[2];
8187
        int charsize;
        /* Code points below 0x10000 fit in one wchar_t; anything larger is
           passed to Windows as a surrogate pair. */
8188
        if (ch < 0x10000) {
8189
            chars[0] = (wchar_t)ch;
8190
            charsize = 1;
8191
        }
8192
        else {
8193
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8194
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8195
            charsize = 2;
8196
        }
8197
8198
        outsize = WideCharToMultiByte(code_page, flags,
8199
                                      chars, charsize,
8200
                                      buffer, Py_ARRAY_LENGTH(buffer),
8201
                                      NULL, pusedDefaultChar);
8202
        if (outsize > 0) {
            /* Converted without substituting the default character: copy
               the bytes and move on.  If the default character was used,
               fall through to the error handler below. */
8203
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8204
            {
8205
                pos++;
8206
                memcpy(out, buffer, outsize);
8207
                out += outsize;
8208
                continue;
8209
            }
8210
        }
8211
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8212
            PyErr_SetFromWindowsErr(0);
8213
            goto error;
8214
        }
8215
8216
        /* Report the single character at `pos` as unencodable; the handler
           returns a replacement and the position to resume from. */
        rep = unicode_encode_call_errorhandler(
8217
                  errors, &errorHandler, encoding, reason,
8218
                  unicode, &exc,
8219
                  pos, pos + 1, &newpos);
8220
        if (rep == NULL)
8221
            goto error;
8222
8223
        /* morebytes = (pos - newpos) + length of the replacement; the
           output object is grown by that amount whenever it is positive. */
        Py_ssize_t morebytes = pos - newpos;
8224
        if (PyBytes_Check(rep)) {
            /* bytes replacement: copied to the output verbatim */
8225
            outsize = PyBytes_GET_SIZE(rep);
8226
            morebytes += outsize;
8227
            if (morebytes > 0) {
                /* _PyBytes_Resize() may move the object, so remember the
                   write offset and rebuild `out` afterwards. */
8228
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8229
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
8230
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8231
                    Py_DECREF(rep);
8232
                    goto error;
8233
                }
8234
                out = PyBytes_AS_STRING(*outbytes) + offset;
8235
            }
8236
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8237
            out += outsize;
8238
        }
8239
        else {
            /* Otherwise the replacement is read as a str; only code points
               <= 127 are accepted and written as single bytes. */
8240
            Py_ssize_t i;
8241
            int kind;
8242
            const void *data;
8243
8244
            outsize = PyUnicode_GET_LENGTH(rep);
8245
            morebytes += outsize;
8246
            if (morebytes > 0) {
8247
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8248
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
8249
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8250
                    Py_DECREF(rep);
8251
                    goto error;
8252
                }
8253
                out = PyBytes_AS_STRING(*outbytes) + offset;
8254
            }
8255
            kind = PyUnicode_KIND(rep);
8256
            data = PyUnicode_DATA(rep);
8257
            for (i=0; i < outsize; i++) {
8258
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8259
                if (ch > 127) {
8260
                    raise_encode_exception(&exc,
8261
                        encoding, unicode,
8262
                        pos, pos + 1,
8263
                        "unable to encode error handler result to ASCII");
8264
                    Py_DECREF(rep);
8265
                    goto error;
8266
                }
8267
                *out = (unsigned char)ch;
8268
                out++;
8269
            }
8270
        }
        /* resume at the position chosen by the error handler */
8271
        pos = newpos;
8272
        Py_DECREF(rep);
8273
    }
8274
    /* write a NUL byte */
8275
    *out = 0;
    /* Trim the over-allocated object down to the bytes actually written. */
8276
    outsize = out - PyBytes_AS_STRING(*outbytes);
8277
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8278
    if (_PyBytes_Resize(outbytes, outsize) < 0)
8279
        goto error;
8280
    ret = 0;
8281
8282
/* Common exit: also reached on success, with ret == 0. */
error:
8283
    Py_XDECREF(encoding_obj);
8284
    Py_XDECREF(errorHandler);
8285
    Py_XDECREF(exc);
8286
    return ret;
8287
}
8289
8290
/* Encode the str object `unicode` to the Windows code page `code_page`.

   The string is processed in chunks of at most DECODING_CHUNK_SIZE
   characters when NEED_RETRY is defined.  Each chunk is first tried with
   the strict encoder; if that reports an unencodable character (-2) the
   chunk is handed to the error-handler aware encoder.

   Returns a new bytes object, or NULL with an exception set. */
static PyObject *
encode_code_page(int code_page,
                 PyObject *unicode,
                 const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    Py_ssize_t remaining = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (remaining == 0) {
        return PyBytes_FromStringAndSize(NULL, 0);
    }

    PyObject *outbytes = NULL;
    Py_ssize_t offset = 0;

    for (;;) {
        int chunk_len, done, ret;

#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            /* More characters follow this chunk. */
            chunk_len = DECODING_CHUNK_SIZE;
            done = 0;
        }
        else
#endif
        {
            /* Last (or only) chunk. */
            chunk_len = (int)remaining;
            done = 1;
        }

        ret = encode_code_page_strict(code_page, &outbytes,
                                      unicode, offset, chunk_len,
                                      errors);
        if (ret == -2) {
            ret = encode_code_page_errors(code_page, &outbytes,
                                          unicode, offset,
                                          chunk_len, errors);
        }
        if (ret < 0) {
            Py_XDECREF(outbytes);
            return NULL;
        }

        offset += chunk_len;
        remaining -= chunk_len;

        if (done) {
            break;
        }
    }

    return outbytes;
}
8348
8349
/* Public wrapper around encode_code_page(); all arguments are forwarded
   unchanged.  Returns a new bytes object or NULL on error. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyObject *encoded = encode_code_page(code_page, unicode, errors);
    return encoded;
}
8356
8357
/* Encode `unicode` with the CP_ACP code page and no explicit error handler
   (errors == NULL).  Returns a new bytes object or NULL on error. */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    PyObject *encoded = PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
    return encoded;
}
8362
8363
#undef NEED_RETRY
8364
8365
#endif /* MS_WINDOWS */
8366
8367
/* --- Character Mapping Codec -------------------------------------------- */
8368
8369
/* Decode the `size` bytes at `s` into `writer`, using the str object
   `mapping` as a lookup table: each input byte indexes `mapping` and the
   character found there is written out.  A byte that is >= the length of
   `mapping`, or that maps to 0xFFFE, is undefined and is passed to the
   error handler named by `errors`.

   Returns 0 on success, -1 with an exception set on failure. */
static int
8370
charmap_decode_string(const char *s,
8371
                      Py_ssize_t size,
8372
                      PyObject *mapping,
8373
                      const char *errors,
8374
                      _PyUnicodeWriter *writer)
8375
13.1k
{
8376
13.1k
    const char *starts = s;
8377
13.1k
    const char *e;
8378
13.1k
    Py_ssize_t startinpos, endinpos;
8379
13.1k
    PyObject *errorHandler = NULL, *exc = NULL;
8380
13.1k
    Py_ssize_t maplen;
8381
13.1k
    int mapkind;
8382
13.1k
    const void *mapdata;
8383
13.1k
    Py_UCS4 x;
8384
13.1k
    unsigned char ch;
8385
8386
13.1k
    maplen = PyUnicode_GET_LENGTH(mapping);
8387
13.1k
    mapdata = PyUnicode_DATA(mapping);
8388
13.1k
    mapkind = PyUnicode_KIND(mapping);
8389
8390
13.1k
    e = s + size;
8391
8392
13.1k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8393
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8394
         * is disabled in encoding aliases, latin1 is preferred because
8395
         * its implementation is faster. */
        /* A Py_UCS1 table entry cannot hold 0xFFFE, so this path has no
           undefined-mapping check and never calls the error handler. */
8396
155
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8397
155
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8398
155
        Py_UCS4 maxchar = writer->maxchar;
8399
8400
155
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8401
67.8k
        while (s < e) {
8402
67.7k
            ch = *s;
8403
67.7k
            x = mapdata_ucs1[ch];
8404
67.7k
            if (x > maxchar) {
                /* Raise the writer's maxchar to 0xff, then reload the
                   cached maxchar and data pointer. */
8405
145
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8406
0
                    goto onError;
8407
145
                maxchar = writer->maxchar;
8408
145
                outdata = (Py_UCS1 *)writer->data;
8409
145
            }
8410
67.7k
            outdata[writer->pos] = x;
8411
67.7k
            writer->pos++;
8412
67.7k
            ++s;
8413
67.7k
        }
8414
155
        return 0;
8415
155
    }
8416
8417
52.0k
    while (s < e) {
8418
45.5k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* 2-byte table with at least 256 entries: run a tight copy
               loop matched to the writer's current kind and jump to
               Error (with x and s set) on the first character that the
               loop cannot store directly. */
8419
45.5k
            int outkind = writer->kind;
8420
45.5k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8421
45.5k
            if (outkind == PyUnicode_1BYTE_KIND) {
8422
24.4k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8423
24.4k
                Py_UCS4 maxchar = writer->maxchar;
8424
92.5k
                while (s < e) {
8425
90.9k
                    ch = *s;
8426
90.9k
                    x = mapdata_ucs2[ch];
                    /* too wide for the current buffer; this also covers
                       0xFFFE when maxchar is below it */
8427
90.9k
                    if (x > maxchar)
8428
22.8k
                        goto Error;
8429
68.1k
                    outdata[writer->pos] = x;
8430
68.1k
                    writer->pos++;
8431
68.1k
                    ++s;
8432
68.1k
                }
8433
1.59k
                break;
8434
24.4k
            }
8435
21.1k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8436
21.1k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8437
309k
                while (s < e) {
8438
305k
                    ch = *s;
8439
305k
                    x = mapdata_ucs2[ch];
                    /* every other Py_UCS2 value fits; only the undefined
                       marker needs the slow path */
8440
305k
                    if (x == 0xFFFE)
8441
16.2k
                        goto Error;
8442
288k
                    outdata[writer->pos] = x;
8443
288k
                    writer->pos++;
8444
288k
                    ++s;
8445
288k
                }
8446
4.81k
                break;
8447
21.1k
            }
8448
45.5k
        }
        /* Generic lookup: any mapping kind or length. */
8449
0
        ch = *s;
8450
8451
0
        if (ch < maplen)
8452
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8453
0
        else
8454
0
            x = 0xfffe; /* invalid value */
8455
39.1k
/* Reached by fall-through from the generic lookup above, or by goto from
   the fast loops; in both cases x holds the mapped value for *s. */
Error:
8456
39.1k
        if (x == 0xfffe)
8457
25.9k
        {
8458
            /* undefined mapping */
8459
25.9k
            startinpos = s-starts;
8460
25.9k
            endinpos = startinpos+1;
8461
25.9k
            if (unicode_decode_call_errorhandler_writer(
8462
25.9k
                    errors, &errorHandler,
8463
25.9k
                    "charmap", "character maps to <undefined>",
8464
25.9k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8465
25.9k
                    writer)) {
8466
25
                goto onError;
8467
25
            }
            /* restart the outer loop; writer->kind is read again there */
8468
25.8k
            continue;
8469
25.9k
        }
8470
8471
        /* x is a defined character: write it through the generic writer
           call and advance to the next input byte. */
13.2k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8472
0
            goto onError;
8473
13.2k
        ++s;
8474
13.2k
    }
8475
12.9k
    Py_XDECREF(errorHandler);
8476
12.9k
    Py_XDECREF(exc);
8477
12.9k
    return 0;
8478
8479
25
onError:
8480
25
    Py_XDECREF(errorHandler);
8481
25
    Py_XDECREF(exc);
8482
25
    return -1;
8483
12.9k
}
8484
8485
static int
8486
charmap_decode_mapping(const char *s,
8487
                       Py_ssize_t size,
8488
                       PyObject *mapping,
8489
                       const char *errors,
8490
                       _PyUnicodeWriter *writer)
8491
0
{
8492
0
    const char *starts = s;
8493
0
    const char *e;
8494
0
    Py_ssize_t startinpos, endinpos;
8495
0
    PyObject *errorHandler = NULL, *exc = NULL;
8496
0
    unsigned char ch;
8497
0
    PyObject *key, *item = NULL;
8498
8499
0
    e = s + size;
8500
8501
0
    while (s < e) {
8502
0
        ch = *s;
8503
8504
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8505
0
        key = PyLong_FromLong((long)ch);
8506
0
        if (key == NULL)
8507
0
            goto onError;
8508
8509
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8510
0
        Py_DECREF(key);
8511
0
        if (rc == 0) {
8512
            /* No mapping found means: mapping is undefined. */
8513
0
            goto Undefined;
8514
0
        }
8515
0
        if (item == NULL) {
8516
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8517
                /* No mapping found means: mapping is undefined. */
8518
0
                PyErr_Clear();
8519
0
                goto Undefined;
8520
0
            } else
8521
0
                goto onError;
8522
0
        }
8523
8524
        /* Apply mapping */
8525
0
        if (item == Py_None)
8526
0
            goto Undefined;
8527
0
        if (PyLong_Check(item)) {
8528
0
            long value = PyLong_AsLong(item);
8529
0
            if (value == 0xFFFE)
8530
0
                goto Undefined;
8531
0
            if (value < 0 || value > MAX_UNICODE) {
8532
0
                PyErr_Format(PyExc_TypeError,
8533
0
                             "character mapping must be in range(0x%x)",
8534
0
                             (unsigned long)MAX_UNICODE + 1);
8535
0
                goto onError;
8536
0
            }
8537
8538
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8539
0
                goto onError;
8540
0
        }
8541
0
        else if (PyUnicode_Check(item)) {
8542
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8543
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8544
0
                if (value == 0xFFFE)
8545
0
                    goto Undefined;
8546
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8547
0
                    goto onError;
8548
0
            }
8549
0
            else {
8550
0
                writer->overallocate = 1;
8551
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8552
0
                    goto onError;
8553
0
            }
8554
0
        }
8555
0
        else {
8556
            /* wrong return value */
8557
0
            PyErr_SetString(PyExc_TypeError,
8558
0
                            "character mapping must return integer, None or str");
8559
0
            goto onError;
8560
0
        }
8561
0
        Py_CLEAR(item);
8562
0
        ++s;
8563
0
        continue;
8564
8565
0
Undefined:
8566
        /* undefined mapping */
8567
0
        Py_CLEAR(item);
8568
0
        startinpos = s-starts;
8569
0
        endinpos = startinpos+1;
8570
0
        if (unicode_decode_call_errorhandler_writer(
8571
0
                errors, &errorHandler,
8572
0
                "charmap", "character maps to <undefined>",
8573
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8574
0
                writer)) {
8575
0
            goto onError;
8576
0
        }
8577
0
    }
8578
0
    Py_XDECREF(errorHandler);
8579
0
    Py_XDECREF(exc);
8580
0
    return 0;
8581
8582
0
onError:
8583
0
    Py_XDECREF(item);
8584
0
    Py_XDECREF(errorHandler);
8585
0
    Py_XDECREF(exc);
8586
0
    return -1;
8587
0
}
8588
8589
PyObject *
8590
PyUnicode_DecodeCharmap(const char *s,
8591
                        Py_ssize_t size,
8592
                        PyObject *mapping,
8593
                        const char *errors)
8594
13.1k
{
8595
13.1k
    _PyUnicodeWriter writer;
8596
8597
    /* Default to Latin-1 */
8598
13.1k
    if (mapping == NULL)
8599
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8600
8601
13.1k
    if (size == 0)
8602
0
        _Py_RETURN_UNICODE_EMPTY();
8603
13.1k
    _PyUnicodeWriter_Init(&writer);
8604
13.1k
    writer.min_length = size;
8605
13.1k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8606
0
        goto onError;
8607
8608
13.1k
    if (PyUnicode_CheckExact(mapping)) {
8609
13.1k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8610
25
            goto onError;
8611
13.1k
    }
8612
0
    else {
8613
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8614
0
            goto onError;
8615
0
    }
8616
13.0k
    return _PyUnicodeWriter_Finish(&writer);
8617
8618
25
  onError:
8619
25
    _PyUnicodeWriter_Dealloc(&writer);
8620
25
    return NULL;
8621
13.1k
}
8622
8623
/* Charmap encoding: the lookup table */
8624
8625
/*[clinic input]
8626
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8627
[clinic start generated code]*/
8628
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8629
8630
struct encoding_map {
8631
    PyObject_HEAD
8632
    unsigned char level1[32];
8633
    int count2, count3;
8634
    unsigned char level23[1];
8635
};
8636
8637
/*[clinic input]
8638
EncodingMap.size
8639
8640
Return the size (in bytes) of this object.
8641
[clinic start generated code]*/
8642
8643
static PyObject *
8644
EncodingMap_size_impl(struct encoding_map *self)
8645
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8646
0
{
8647
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8648
0
                           128*self->count3);
8649
0
}
8650
8651
static PyMethodDef encoding_map_methods[] = {
8652
    ENCODINGMAP_SIZE_METHODDEF
8653
    {NULL, NULL}
8654
};
8655
8656
static PyTypeObject EncodingMapType = {
8657
    PyVarObject_HEAD_INIT(NULL, 0)
8658
    .tp_name = "EncodingMap",
8659
    .tp_basicsize = sizeof(struct encoding_map),
8660
    /* methods */
8661
    .tp_flags = Py_TPFLAGS_DEFAULT,
8662
    .tp_methods = encoding_map_methods,
8663
};
8664
8665
PyObject*
8666
PyUnicode_BuildEncodingMap(PyObject* string)
8667
114
{
8668
114
    PyObject *result;
8669
114
    struct encoding_map *mresult;
8670
114
    int i;
8671
114
    int need_dict = 0;
8672
114
    unsigned char level1[32];
8673
114
    unsigned char level2[512];
8674
114
    unsigned char *mlevel1, *mlevel2, *mlevel3;
8675
114
    int count2 = 0, count3 = 0;
8676
114
    int kind;
8677
114
    const void *data;
8678
114
    int length;
8679
114
    Py_UCS4 ch;
8680
8681
114
    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8682
0
        PyErr_BadArgument();
8683
0
        return NULL;
8684
0
    }
8685
114
    kind = PyUnicode_KIND(string);
8686
114
    data = PyUnicode_DATA(string);
8687
114
    length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
8688
114
    memset(level1, 0xFF, sizeof level1);
8689
114
    memset(level2, 0xFF, sizeof level2);
8690
8691
    /* If there isn't a one-to-one mapping of NULL to \0,
8692
       or if there are non-BMP characters, we need to use
8693
       a mapping dictionary. */
8694
114
    if (PyUnicode_READ(kind, data, 0) != 0)
8695
0
        need_dict = 1;
8696
29.1k
    for (i = 1; i < length; i++) {
8697
29.0k
        int l1, l2;
8698
29.0k
        ch = PyUnicode_READ(kind, data, i);
8699
29.0k
        if (ch == 0 || ch > 0xFFFF) {
8700
0
            need_dict = 1;
8701
0
            break;
8702
0
        }
8703
29.0k
        if (ch == 0xFFFE)
8704
            /* unmapped character */
8705
745
            continue;
8706
28.3k
        l1 = ch >> 11;
8707
28.3k
        l2 = ch >> 7;
8708
28.3k
        if (level1[l1] == 0xFF)
8709
207
            level1[l1] = count2++;
8710
28.3k
        if (level2[l2] == 0xFF)
8711
622
            level2[l2] = count3++;
8712
28.3k
    }
8713
8714
114
    if (count2 >= 0xFF || count3 >= 0xFF)
8715
0
        need_dict = 1;
8716
8717
114
    if (need_dict) {
8718
0
        PyObject *result = PyDict_New();
8719
0
        if (!result)
8720
0
            return NULL;
8721
0
        for (i = 0; i < length; i++) {
8722
0
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
8723
0
            PyObject *key = PyLong_FromLong(c);
8724
0
            if (key == NULL) {
8725
0
                Py_DECREF(result);
8726
0
                return NULL;
8727
0
            }
8728
0
            PyObject *value = PyLong_FromLong(i);
8729
0
            if (value == NULL) {
8730
0
                Py_DECREF(key);
8731
0
                Py_DECREF(result);
8732
0
                return NULL;
8733
0
            }
8734
0
            int rc = PyDict_SetItem(result, key, value);
8735
0
            Py_DECREF(key);
8736
0
            Py_DECREF(value);
8737
0
            if (rc < 0) {
8738
0
                Py_DECREF(result);
8739
0
                return NULL;
8740
0
            }
8741
0
        }
8742
0
        return result;
8743
0
    }
8744
8745
    /* Create a three-level trie */
8746
114
    result = PyObject_Malloc(sizeof(struct encoding_map) +
8747
114
                             16*count2 + 128*count3 - 1);
8748
114
    if (!result) {
8749
0
        return PyErr_NoMemory();
8750
0
    }
8751
8752
114
    _PyObject_Init(result, &EncodingMapType);
8753
114
    mresult = (struct encoding_map*)result;
8754
114
    mresult->count2 = count2;
8755
114
    mresult->count3 = count3;
8756
114
    mlevel1 = mresult->level1;
8757
114
    mlevel2 = mresult->level23;
8758
114
    mlevel3 = mresult->level23 + 16*count2;
8759
114
    memcpy(mlevel1, level1, 32);
8760
114
    memset(mlevel2, 0xFF, 16*count2);
8761
114
    memset(mlevel3, 0, 128*count3);
8762
114
    count3 = 0;
8763
29.1k
    for (i = 1; i < length; i++) {
8764
29.0k
        int o1, o2, o3, i2, i3;
8765
29.0k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8766
29.0k
        if (ch == 0xFFFE)
8767
            /* unmapped character */
8768
745
            continue;
8769
28.3k
        o1 = ch>>11;
8770
28.3k
        o2 = (ch>>7) & 0xF;
8771
28.3k
        i2 = 16*mlevel1[o1] + o2;
8772
28.3k
        if (mlevel2[i2] == 0xFF)
8773
622
            mlevel2[i2] = count3++;
8774
28.3k
        o3 = ch & 0x7F;
8775
28.3k
        i3 = 128*mlevel2[i2] + o3;
8776
28.3k
        mlevel3[i3] = i;
8777
28.3k
    }
8778
114
    return result;
8779
114
}
8780
8781
static int
8782
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8783
0
{
8784
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8785
0
    int l1 = c>>11;
8786
0
    int l2 = (c>>7) & 0xF;
8787
0
    int l3 = c & 0x7F;
8788
0
    int i;
8789
8790
0
    if (c > 0xFFFF)
8791
0
        return -1;
8792
0
    if (c == 0)
8793
0
        return 0;
8794
    /* level 1*/
8795
0
    i = map->level1[l1];
8796
0
    if (i == 0xFF) {
8797
0
        return -1;
8798
0
    }
8799
    /* level 2*/
8800
0
    i = map->level23[16*i+l2];
8801
0
    if (i == 0xFF) {
8802
0
        return -1;
8803
0
    }
8804
    /* level 3 */
8805
0
    i = map->level23[16*map->count2 + 128*i + l3];
8806
0
    if (i == 0) {
8807
0
        return -1;
8808
0
    }
8809
0
    return i;
8810
0
}
8811
8812
/* Lookup the character in the mapping.
8813
   On success, return PyLong, PyBytes or None (if the character can't be found).
8814
   If the result is PyLong, put its value in replace.
8815
   On error, return NULL.
8816
   */
8817
static PyObject *
8818
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8819
0
{
8820
0
    PyObject *w = PyLong_FromLong((long)c);
8821
0
    PyObject *x;
8822
8823
0
    if (w == NULL)
8824
0
        return NULL;
8825
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8826
0
    Py_DECREF(w);
8827
0
    if (rc == 0) {
8828
        /* No mapping found means: mapping is undefined. */
8829
0
        Py_RETURN_NONE;
8830
0
    }
8831
0
    if (x == NULL) {
8832
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8833
            /* No mapping found means: mapping is undefined. */
8834
0
            PyErr_Clear();
8835
0
            Py_RETURN_NONE;
8836
0
        } else
8837
0
            return NULL;
8838
0
    }
8839
0
    else if (x == Py_None)
8840
0
        return x;
8841
0
    else if (PyLong_Check(x)) {
8842
0
        long value = PyLong_AsLong(x);
8843
0
        if (value < 0 || value > 255) {
8844
0
            PyErr_SetString(PyExc_TypeError,
8845
0
                            "character mapping must be in range(256)");
8846
0
            Py_DECREF(x);
8847
0
            return NULL;
8848
0
        }
8849
0
        *replace = (unsigned char)value;
8850
0
        return x;
8851
0
    }
8852
0
    else if (PyBytes_Check(x))
8853
0
        return x;
8854
0
    else {
8855
        /* wrong return value */
8856
0
        PyErr_Format(PyExc_TypeError,
8857
0
                     "character mapping must return integer, bytes or None, not %.400s",
8858
0
                     Py_TYPE(x)->tp_name);
8859
0
        Py_DECREF(x);
8860
0
        return NULL;
8861
0
    }
8862
0
}
8863
8864
static int
8865
charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8866
0
{
8867
0
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8868
    /* exponentially overallocate to minimize reallocations */
8869
0
    if (requiredsize < 2*outsize)
8870
0
        requiredsize = 2*outsize;
8871
0
    if (_PyBytes_Resize(outobj, requiredsize))
8872
0
        return -1;
8873
0
    return 0;
8874
0
}
8875
8876
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
8879
/* lookup the character, put the result in the output string and adjust
8880
   various state variables. Resize the output bytes object if not enough
8881
   space is available. Return a new reference to the object that
8882
   was put in the output buffer, or Py_None, if the mapping was undefined
8883
   (in which case no character was written) or NULL, if a
8884
   reallocation error occurred. The caller must decref the result */
8885
static charmapencode_result
8886
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8887
                     PyObject **outobj, Py_ssize_t *outpos)
8888
0
{
8889
0
    PyObject *rep;
8890
0
    unsigned char replace;
8891
0
    char *outstart;
8892
0
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8893
8894
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8895
0
        int res = encoding_map_lookup(c, mapping);
8896
0
        Py_ssize_t requiredsize = *outpos+1;
8897
0
        if (res == -1)
8898
0
            return enc_FAILED;
8899
0
        if (outsize<requiredsize)
8900
0
            if (charmapencode_resize(outobj, outpos, requiredsize))
8901
0
                return enc_EXCEPTION;
8902
0
        outstart = PyBytes_AS_STRING(*outobj);
8903
0
        outstart[(*outpos)++] = (char)res;
8904
0
        return enc_SUCCESS;
8905
0
    }
8906
8907
0
    rep = charmapencode_lookup(c, mapping, &replace);
8908
0
    if (rep==NULL)
8909
0
        return enc_EXCEPTION;
8910
0
    else if (rep==Py_None) {
8911
0
        Py_DECREF(rep);
8912
0
        return enc_FAILED;
8913
0
    } else {
8914
0
        if (PyLong_Check(rep)) {
8915
0
            Py_ssize_t requiredsize = *outpos+1;
8916
0
            if (outsize<requiredsize)
8917
0
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8918
0
                    Py_DECREF(rep);
8919
0
                    return enc_EXCEPTION;
8920
0
                }
8921
0
            outstart = PyBytes_AS_STRING(*outobj);
8922
0
            outstart[(*outpos)++] = (char)replace;
8923
0
        }
8924
0
        else {
8925
0
            const char *repchars = PyBytes_AS_STRING(rep);
8926
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8927
0
            Py_ssize_t requiredsize = *outpos+repsize;
8928
0
            if (outsize<requiredsize)
8929
0
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8930
0
                    Py_DECREF(rep);
8931
0
                    return enc_EXCEPTION;
8932
0
                }
8933
0
            outstart = PyBytes_AS_STRING(*outobj);
8934
0
            memcpy(outstart + *outpos, repchars, repsize);
8935
0
            *outpos += repsize;
8936
0
        }
8937
0
    }
8938
0
    Py_DECREF(rep);
8939
0
    return enc_SUCCESS;
8940
0
}
8941
8942
/* handle an error in PyUnicode_EncodeCharmap
8943
   Return 0 on success, -1 on error */
8944
static int
8945
charmap_encoding_error(
8946
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8947
    PyObject **exceptionObject,
8948
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8949
    PyObject **res, Py_ssize_t *respos)
8950
0
{
8951
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8952
0
    Py_ssize_t size, repsize;
8953
0
    Py_ssize_t newpos;
8954
0
    int kind;
8955
0
    const void *data;
8956
0
    Py_ssize_t index;
8957
    /* startpos for collecting unencodable chars */
8958
0
    Py_ssize_t collstartpos = *inpos;
8959
0
    Py_ssize_t collendpos = *inpos+1;
8960
0
    Py_ssize_t collpos;
8961
0
    const char *encoding = "charmap";
8962
0
    const char *reason = "character maps to <undefined>";
8963
0
    charmapencode_result x;
8964
0
    Py_UCS4 ch;
8965
0
    int val;
8966
8967
0
    size = PyUnicode_GET_LENGTH(unicode);
8968
    /* find all unencodable characters */
8969
0
    while (collendpos < size) {
8970
0
        PyObject *rep;
8971
0
        unsigned char replace;
8972
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8973
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8974
0
            val = encoding_map_lookup(ch, mapping);
8975
0
            if (val != -1)
8976
0
                break;
8977
0
            ++collendpos;
8978
0
            continue;
8979
0
        }
8980
8981
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8982
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8983
0
        if (rep==NULL)
8984
0
            return -1;
8985
0
        else if (rep!=Py_None) {
8986
0
            Py_DECREF(rep);
8987
0
            break;
8988
0
        }
8989
0
        Py_DECREF(rep);
8990
0
        ++collendpos;
8991
0
    }
8992
    /* cache callback name lookup
8993
     * (if not done yet, i.e. it's the first error) */
8994
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8995
0
        *error_handler = _Py_GetErrorHandler(errors);
8996
8997
0
    switch (*error_handler) {
8998
0
    case _Py_ERROR_STRICT:
8999
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
9000
0
        return -1;
9001
9002
0
    case _Py_ERROR_REPLACE:
9003
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
9004
0
            x = charmapencode_output('?', mapping, res, respos);
9005
0
            if (x==enc_EXCEPTION) {
9006
0
                return -1;
9007
0
            }
9008
0
            else if (x==enc_FAILED) {
9009
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
9010
0
                return -1;
9011
0
            }
9012
0
        }
9013
0
        _Py_FALLTHROUGH;
9014
0
    case _Py_ERROR_IGNORE:
9015
0
        *inpos = collendpos;
9016
0
        break;
9017
9018
0
    case _Py_ERROR_XMLCHARREFREPLACE:
9019
        /* generate replacement (temporarily (mis)uses p) */
9020
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
9021
0
            char buffer[2+29+1+1];
9022
0
            char *cp;
9023
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
9024
0
            for (cp = buffer; *cp; ++cp) {
9025
0
                x = charmapencode_output(*cp, mapping, res, respos);
9026
0
                if (x==enc_EXCEPTION)
9027
0
                    return -1;
9028
0
                else if (x==enc_FAILED) {
9029
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
9030
0
                    return -1;
9031
0
                }
9032
0
            }
9033
0
        }
9034
0
        *inpos = collendpos;
9035
0
        break;
9036
9037
0
    default:
9038
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
9039
0
                                                      encoding, reason, unicode, exceptionObject,
9040
0
                                                      collstartpos, collendpos, &newpos);
9041
0
        if (repunicode == NULL)
9042
0
            return -1;
9043
0
        if (PyBytes_Check(repunicode)) {
9044
            /* Directly copy bytes result to output. */
9045
0
            Py_ssize_t outsize = PyBytes_Size(*res);
9046
0
            Py_ssize_t requiredsize;
9047
0
            repsize = PyBytes_Size(repunicode);
9048
0
            requiredsize = *respos + repsize;
9049
0
            if (requiredsize > outsize)
9050
                /* Make room for all additional bytes. */
9051
0
                if (charmapencode_resize(res, respos, requiredsize)) {
9052
0
                    Py_DECREF(repunicode);
9053
0
                    return -1;
9054
0
                }
9055
0
            memcpy(PyBytes_AsString(*res) + *respos,
9056
0
                   PyBytes_AsString(repunicode),  repsize);
9057
0
            *respos += repsize;
9058
0
            *inpos = newpos;
9059
0
            Py_DECREF(repunicode);
9060
0
            break;
9061
0
        }
9062
        /* generate replacement  */
9063
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
9064
0
        data = PyUnicode_DATA(repunicode);
9065
0
        kind = PyUnicode_KIND(repunicode);
9066
0
        for (index = 0; index < repsize; index++) {
9067
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
9068
0
            x = charmapencode_output(repch, mapping, res, respos);
9069
0
            if (x==enc_EXCEPTION) {
9070
0
                Py_DECREF(repunicode);
9071
0
                return -1;
9072
0
            }
9073
0
            else if (x==enc_FAILED) {
9074
0
                Py_DECREF(repunicode);
9075
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
9076
0
                return -1;
9077
0
            }
9078
0
        }
9079
0
        *inpos = newpos;
9080
0
        Py_DECREF(repunicode);
9081
0
    }
9082
0
    return 0;
9083
0
}
9084
9085
/* Encode the str `unicode` to bytes through `mapping`, using the error
   handler named by `errors`.  A NULL mapping means Latin-1.
   Returns a new bytes object, or NULL with an exception set. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    PyObject *res = NULL;                   /* output object */
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    Py_ssize_t inpos = 0;                   /* current input position */
    Py_ssize_t respos = 0;                  /* current output position */
    Py_ssize_t size;
    const void *data;
    int kind;

    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    size = PyUnicode_GET_LENGTH(unicode);
    data = PyUnicode_DATA(unicode);
    kind = PyUnicode_KIND(unicode);

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL) {
        goto onError;
    }
    if (size == 0) {
        return res;
    }

    while (inpos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
        /* try to encode it */
        charmapencode_result status =
            charmapencode_output(ch, mapping, &res, &respos);

        if (status == enc_EXCEPTION) {
            goto onError;
        }
        if (status != enc_FAILED) {
            /* done with this character => adjust input position */
            ++inpos;
            continue;
        }
        /* unencodable character: the error handler advances inpos itself */
        if (charmap_encoding_error(unicode, &inpos, mapping,
                                   &exc,
                                   &error_handler, &error_handler_obj, errors,
                                   &res, &respos)) {
            goto onError;
        }
    }

    /* Resize if we allocated too much */
    if (respos < PyBytes_GET_SIZE(res)) {
        if (_PyBytes_Resize(&res, respos) < 0) {
            goto onError;
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9153
9154
/* Encode the str `unicode` through `mapping` with the default error
   handling (errors == NULL).  Both arguments are required: a non-str
   `unicode` or a NULL `mapping` is reported with PyErr_BadArgument(). */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
9164
9165
/* create or adjust a UnicodeTranslateError */
9166
static void
9167
make_translate_exception(PyObject **exceptionObject,
9168
                         PyObject *unicode,
9169
                         Py_ssize_t startpos, Py_ssize_t endpos,
9170
                         const char *reason)
9171
0
{
9172
0
    if (*exceptionObject == NULL) {
9173
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9174
0
            unicode, startpos, endpos, reason);
9175
0
    }
9176
0
    else {
9177
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9178
0
            goto onError;
9179
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9180
0
            goto onError;
9181
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9182
0
            goto onError;
9183
0
        return;
9184
0
      onError:
9185
0
        Py_CLEAR(*exceptionObject);
9186
0
    }
9187
0
}
9188
9189
/* error handling callback helper:
9190
   build arguments, call the callback and check the arguments,
9191
   put the result into newpos and return the replacement string, which
9192
   has to be freed by the caller */
9193
static PyObject *
9194
unicode_translate_call_errorhandler(const char *errors,
9195
                                    PyObject **errorHandler,
9196
                                    const char *reason,
9197
                                    PyObject *unicode, PyObject **exceptionObject,
9198
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9199
                                    Py_ssize_t *newpos)
9200
0
{
9201
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9202
9203
0
    Py_ssize_t i_newpos;
9204
0
    PyObject *restuple;
9205
0
    PyObject *resunicode;
9206
9207
0
    if (*errorHandler == NULL) {
9208
0
        *errorHandler = PyCodec_LookupError(errors);
9209
0
        if (*errorHandler == NULL)
9210
0
            return NULL;
9211
0
    }
9212
9213
0
    make_translate_exception(exceptionObject,
9214
0
                             unicode, startpos, endpos, reason);
9215
0
    if (*exceptionObject == NULL)
9216
0
        return NULL;
9217
9218
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9219
0
    if (restuple == NULL)
9220
0
        return NULL;
9221
0
    if (!PyTuple_Check(restuple)) {
9222
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9223
0
        Py_DECREF(restuple);
9224
0
        return NULL;
9225
0
    }
9226
0
    if (!PyArg_ParseTuple(restuple, argparse,
9227
0
                          &resunicode, &i_newpos)) {
9228
0
        Py_DECREF(restuple);
9229
0
        return NULL;
9230
0
    }
9231
0
    if (i_newpos<0)
9232
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9233
0
    else
9234
0
        *newpos = i_newpos;
9235
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9236
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9237
0
        Py_DECREF(restuple);
9238
0
        return NULL;
9239
0
    }
9240
0
    Py_INCREF(resunicode);
9241
0
    Py_DECREF(restuple);
9242
0
    return resunicode;
9243
0
}
9244
9245
/* Lookup the character ch in the mapping and put the result in result,
9246
   which must be decrefed by the caller.
9247
   The result can be PyLong, PyUnicode, None or NULL.
9248
   If the result is PyLong, put its value in replace.
9249
   Return 0 on success, -1 on error */
9250
static int
9251
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9252
150
{
9253
150
    PyObject *w = PyLong_FromLong((long)c);
9254
150
    PyObject *x;
9255
9256
150
    if (w == NULL)
9257
0
        return -1;
9258
150
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9259
150
    Py_DECREF(w);
9260
150
    if (rc == 0) {
9261
        /* No mapping found means: use 1:1 mapping. */
9262
74
        *result = NULL;
9263
74
        return 0;
9264
74
    }
9265
76
    if (x == NULL) {
9266
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9267
            /* No mapping found means: use 1:1 mapping. */
9268
0
            PyErr_Clear();
9269
0
            *result = NULL;
9270
0
            return 0;
9271
0
        } else
9272
0
            return -1;
9273
0
    }
9274
76
    else if (x == Py_None) {
9275
0
        *result = x;
9276
0
        return 0;
9277
0
    }
9278
76
    else if (PyLong_Check(x)) {
9279
0
        long value = PyLong_AsLong(x);
9280
0
        if (value < 0 || value > MAX_UNICODE) {
9281
0
            PyErr_Format(PyExc_ValueError,
9282
0
                         "character mapping must be in range(0x%x)",
9283
0
                         MAX_UNICODE+1);
9284
0
            Py_DECREF(x);
9285
0
            return -1;
9286
0
        }
9287
0
        *result = x;
9288
0
        *replace = (Py_UCS4)value;
9289
0
        return 0;
9290
0
    }
9291
76
    else if (PyUnicode_Check(x)) {
9292
76
        *result = x;
9293
76
        return 0;
9294
76
    }
9295
0
    else {
9296
        /* wrong return value */
9297
0
        PyErr_SetString(PyExc_TypeError,
9298
0
                        "character mapping must return integer, None or str");
9299
0
        Py_DECREF(x);
9300
0
        return -1;
9301
0
    }
9302
76
}
9303
9304
/* lookup the character, write the result into the writer.
9305
   Return 1 if the result was written into the writer, return 0 if the mapping
9306
   was undefined, raise an exception return -1 on error. */
9307
static int
9308
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9309
                        _PyUnicodeWriter *writer)
9310
139
{
9311
139
    PyObject *item;
9312
139
    Py_UCS4 replace;
9313
9314
139
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9315
0
        return -1;
9316
9317
139
    if (item == NULL) {
9318
        /* not found => default to 1:1 mapping */
9319
71
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9320
0
            return -1;
9321
0
        }
9322
71
        return 1;
9323
71
    }
9324
9325
68
    if (item == Py_None) {
9326
0
        Py_DECREF(item);
9327
0
        return 0;
9328
0
    }
9329
9330
68
    if (PyLong_Check(item)) {
9331
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9332
0
            Py_DECREF(item);
9333
0
            return -1;
9334
0
        }
9335
0
        Py_DECREF(item);
9336
0
        return 1;
9337
0
    }
9338
9339
68
    if (!PyUnicode_Check(item)) {
9340
0
        Py_DECREF(item);
9341
0
        return -1;
9342
0
    }
9343
9344
68
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9345
0
        Py_DECREF(item);
9346
0
        return -1;
9347
0
    }
9348
9349
68
    Py_DECREF(item);
9350
68
    return 1;
9351
68
}
9352
9353
static int
9354
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9355
                              Py_UCS1 *translate)
9356
11
{
9357
11
    PyObject *item = NULL;
9358
11
    Py_UCS4 replace;
9359
11
    int ret = 0;
9360
9361
11
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9362
0
        return -1;
9363
0
    }
9364
9365
11
    if (item == Py_None) {
9366
        /* deletion */
9367
0
        translate[ch] = 0xfe;
9368
0
    }
9369
11
    else if (item == NULL) {
9370
        /* not found => default to 1:1 mapping */
9371
3
        translate[ch] = ch;
9372
3
        return 1;
9373
3
    }
9374
8
    else if (PyLong_Check(item)) {
9375
0
        if (replace > 127) {
9376
            /* invalid character or character outside ASCII:
9377
               skip the fast translate */
9378
0
            goto exit;
9379
0
        }
9380
0
        translate[ch] = (Py_UCS1)replace;
9381
0
    }
9382
8
    else if (PyUnicode_Check(item)) {
9383
8
        if (PyUnicode_GET_LENGTH(item) != 1)
9384
8
            goto exit;
9385
9386
0
        replace = PyUnicode_READ_CHAR(item, 0);
9387
0
        if (replace > 127)
9388
0
            goto exit;
9389
0
        translate[ch] = (Py_UCS1)replace;
9390
0
    }
9391
0
    else {
9392
        /* not None, NULL, long or unicode */
9393
0
        goto exit;
9394
0
    }
9395
0
    ret = 1;
9396
9397
8
  exit:
9398
8
    Py_DECREF(item);
9399
8
    return ret;
9400
0
}
9401
9402
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9403
   was translated into writer, return 0 if the input string was partially
9404
   translated into writer, raise an exception and return -1 on error. */
9405
static int
9406
unicode_fast_translate(PyObject *input, PyObject *mapping,
9407
                       _PyUnicodeWriter *writer, int ignore,
9408
                       Py_ssize_t *input_pos)
9409
8
{
9410
8
    Py_UCS1 ascii_table[128], ch, ch2;
9411
8
    Py_ssize_t len;
9412
8
    const Py_UCS1 *in, *end;
9413
8
    Py_UCS1 *out;
9414
8
    int res = 0;
9415
9416
8
    len = PyUnicode_GET_LENGTH(input);
9417
9418
8
    memset(ascii_table, 0xff, 128);
9419
9420
8
    in = PyUnicode_1BYTE_DATA(input);
9421
8
    end = in + len;
9422
9423
8
    assert(PyUnicode_IS_ASCII(writer->buffer));
9424
8
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9425
8
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9426
9427
11
    for (; in < end; in++) {
9428
11
        ch = *in;
9429
11
        ch2 = ascii_table[ch];
9430
11
        if (ch2 == 0xff) {
9431
11
            int translate = unicode_fast_translate_lookup(mapping, ch,
9432
11
                                                          ascii_table);
9433
11
            if (translate < 0)
9434
0
                return -1;
9435
11
            if (translate == 0)
9436
8
                goto exit;
9437
3
            ch2 = ascii_table[ch];
9438
3
        }
9439
3
        if (ch2 == 0xfe) {
9440
0
            if (ignore)
9441
0
                continue;
9442
0
            goto exit;
9443
0
        }
9444
3
        assert(ch2 < 128);
9445
3
        *out = ch2;
9446
3
        out++;
9447
3
    }
9448
0
    res = 1;
9449
9450
8
exit:
9451
8
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9452
8
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9453
8
    return res;
9454
0
}
9455
9456
/* Translate `input` through `mapping` and return a new string.

   A NULL `mapping` raises via PyErr_BadArgument().  An empty input is
   returned through PyUnicode_FromObject().  ASCII inputs first go through
   unicode_fast_translate(); whatever it could not handle is processed
   character by character with charmaptranslate_output().  Runs of
   untranslatable characters are either skipped (errors == "ignore") or
   handed to the error handler selected by `errors`.

   Returns NULL with the writer released on failure. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    const char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    _PyUnicodeWriter writer;
    Py_ssize_t pos = 0;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    const void *data = PyUnicode_DATA(input);
    const int kind = PyUnicode_KIND(input);
    const Py_ssize_t size = PyUnicode_GET_LENGTH(input);

    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* Reserve room for a plain 1:1 translation; the writer grows on
       demand if replacements turn out to be longer. */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    const int ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    if (PyUnicode_IS_ASCII(input)) {
        int fast = unicode_fast_translate(input, mapping, &writer, ignore,
                                          &pos);
        if (fast < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (fast == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
        /* fast == 0: continue below from the position stored in pos. */
    }

    while (pos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        int status = charmaptranslate_output(ch, mapping, &writer);
        if (status < 0) {
            goto onError;
        }
        if (status != 0) {
            /* Character was written; move on to the next one. */
            pos++;
            continue;
        }

        /* Untranslatable character: extend the run for as long as the
           lookup keeps yielding Py_None. */
        const Py_ssize_t run_start = pos;
        Py_ssize_t run_end = pos + 1;
        for (; run_end < size; run_end++) {
            PyObject *looked_up;
            Py_UCS4 replace;
            ch = PyUnicode_READ(kind, data, run_end);
            if (charmaptranslate_lookup(ch, mapping, &looked_up, &replace)) {
                goto onError;
            }
            Py_XDECREF(looked_up);
            /* Only the pointer identity is compared after the decref. */
            if (looked_up != Py_None) {
                break;
            }
        }

        if (ignore) {
            pos = run_end;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *repunicode = unicode_translate_call_errorhandler(
            errors, &errorHandler, reason, input, &exc,
            run_start, run_end, &newpos);
        if (repunicode == NULL) {
            goto onError;
        }
        int write_status = _PyUnicodeWriter_WriteStr(&writer, repunicode);
        Py_DECREF(repunicode);
        if (write_status < 0) {
            goto onError;
        }
        pos = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9572
9573
/* Public entry point: check `str` with ensure_unicode(), then delegate to
   _PyUnicode_TranslateCharmap().  Returns NULL if the check fails. */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    if (ensure_unicode(str) >= 0) {
        return _PyUnicode_TranslateCharmap(str, mapping, errors);
    }
    return NULL;
}
9582
9583
/* Return an ASCII version of `unicode` in which whitespace becomes ' ' and
   decimal digits become '0'..'9'.

   An ASCII input is returned unchanged as a new reference.  For any other
   input a new string of the same length is built; at the first character
   that is neither below 127, whitespace, nor a decimal digit, a '?' is
   stored, the result is truncated right after it, and conversion stops.

   A non-str argument raises via PyErr_BadInternalCall(). */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* Nothing to transform. */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    for (Py_ssize_t i = 0; i < len; ++i) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (ch < 127) {
            out[i] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[i] = ' ';
            continue;
        }

        const int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            out[i] = '0' + decimal;
            continue;
        }

        /* Not convertible: mark it, terminate and shorten the result. */
        out[i] = '?';
        out[i+1] = '\0';
        _PyUnicode_LENGTH(result) = i + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9628
9629
/* --- Helpers ------------------------------------------------------------ */
9630
9631
/* Helper macro to clamp start/end slice values against a length.

   `end` greater than `len` is clamped to `len`; a negative `end` or
   `start` is taken relative to `len` and clamped to 0 if still negative.

   `start` and `end` must be modifiable lvalues.  All arguments may be
   evaluated more than once, so they must not have side effects.  Every
   argument is parenthesized so that expression arguments (for example a
   conditional expression passed as `len`) keep their intended meaning. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9650
9651
static Py_ssize_t
9652
any_find_slice(PyObject* s1, PyObject* s2,
9653
               Py_ssize_t start,
9654
               Py_ssize_t end,
9655
               int direction)
9656
16.3M
{
9657
16.3M
    int kind1, kind2;
9658
16.3M
    const void *buf1, *buf2;
9659
16.3M
    Py_ssize_t len1, len2, result;
9660
9661
16.3M
    kind1 = PyUnicode_KIND(s1);
9662
16.3M
    kind2 = PyUnicode_KIND(s2);
9663
16.3M
    if (kind1 < kind2)
9664
0
        return -1;
9665
9666
16.3M
    len1 = PyUnicode_GET_LENGTH(s1);
9667
16.3M
    len2 = PyUnicode_GET_LENGTH(s2);
9668
16.3M
    ADJUST_INDICES(start, end, len1);
9669
16.3M
    if (end - start < len2)
9670
25.7k
        return -1;
9671
9672
16.2M
    buf1 = PyUnicode_DATA(s1);
9673
16.2M
    buf2 = PyUnicode_DATA(s2);
9674
16.2M
    if (len2 == 1) {
9675
16.2M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9676
16.2M
        result = findchar((const char *)buf1 + kind1*start,
9677
16.2M
                          kind1, end - start, ch, direction);
9678
16.2M
        if (result == -1)
9679
218k
            return -1;
9680
16.0M
        else
9681
16.0M
            return start + result;
9682
16.2M
    }
9683
9684
0
    if (kind2 != kind1) {
9685
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9686
0
        if (!buf2)
9687
0
            return -2;
9688
0
    }
9689
9690
0
    if (direction > 0) {
9691
0
        switch (kind1) {
9692
0
        case PyUnicode_1BYTE_KIND:
9693
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9694
0
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9695
0
            else
9696
0
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9697
0
            break;
9698
0
        case PyUnicode_2BYTE_KIND:
9699
0
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9700
0
            break;
9701
0
        case PyUnicode_4BYTE_KIND:
9702
0
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9703
0
            break;
9704
0
        default:
9705
0
            Py_UNREACHABLE();
9706
0
        }
9707
0
    }
9708
0
    else {
9709
0
        switch (kind1) {
9710
0
        case PyUnicode_1BYTE_KIND:
9711
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9712
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9713
0
            else
9714
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9715
0
            break;
9716
0
        case PyUnicode_2BYTE_KIND:
9717
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9718
0
            break;
9719
0
        case PyUnicode_4BYTE_KIND:
9720
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9721
0
            break;
9722
0
        default:
9723
0
            Py_UNREACHABLE();
9724
0
        }
9725
0
    }
9726
9727
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9728
0
    if (kind2 != kind1)
9729
0
        PyMem_Free((void *)buf2);
9730
9731
0
    return result;
9732
0
}
9733
9734
/* _PyUnicode_InsertThousandsGrouping() helper functions */
9735
#include "stringlib/localeutil.h"
9736
9737
/**
9738
 * InsertThousandsGrouping:
9739
 * @writer: Unicode writer.
9740
 * @n_buffer: Number of characters in @buffer.
9741
 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9742
 * @d_pos: Start of digits string.
9743
 * @n_digits: The number of digits in the string, in which we want
9744
 *            to put the grouping chars.
9745
 * @min_width: The minimum width of the digits in the output string.
9746
 *             Output will be zero-padded on the left to fill.
9747
 * @grouping: see definition in localeconv().
9748
 * @thousands_sep: see definition in localeconv().
9749
 *
9750
 * There are 2 modes: counting and filling. If @writer is NULL,
9751
 *  we are in counting mode, else filling mode.
9752
 * If counting, the required buffer size is returned.
9753
 * If filling, we know the buffer will be large enough, so we don't
9754
 *  need to pass in the buffer size.
9755
 * Inserts thousand grouping characters (as defined by grouping and
9756
 *  thousands_sep) into @writer.
9757
 *
9758
 * Return value: -1 on error, number of characters otherwise.
9759
 **/
9760
Py_ssize_t
9761
_PyUnicode_InsertThousandsGrouping(
9762
    _PyUnicodeWriter *writer,
9763
    Py_ssize_t n_buffer,
9764
    PyObject *digits,
9765
    Py_ssize_t d_pos,
9766
    Py_ssize_t n_digits,
9767
    Py_ssize_t min_width,
9768
    const char *grouping,
9769
    PyObject *thousands_sep,
9770
    Py_UCS4 *maxchar,
9771
    int forward)
9772
128
{
9773
128
    min_width = Py_MAX(0, min_width);
9774
128
    if (writer) {
9775
64
        assert(digits != NULL);
9776
64
        assert(maxchar == NULL);
9777
64
    }
9778
64
    else {
9779
64
        assert(digits == NULL);
9780
64
        assert(maxchar != NULL);
9781
64
    }
9782
128
    assert(0 <= d_pos);
9783
128
    assert(0 <= n_digits);
9784
128
    assert(grouping != NULL);
9785
9786
128
    Py_ssize_t count = 0;
9787
128
    Py_ssize_t n_zeros;
9788
128
    int loop_broken = 0;
9789
128
    int use_separator = 0; /* First time through, don't append the
9790
                              separator. They only go between
9791
                              groups. */
9792
128
    Py_ssize_t buffer_pos;
9793
128
    Py_ssize_t digits_pos;
9794
128
    Py_ssize_t len;
9795
128
    Py_ssize_t n_chars;
9796
128
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9797
                                        be looked at */
9798
    /* A generator that returns all of the grouping widths, until it
9799
       returns 0. */
9800
128
    GroupGenerator groupgen;
9801
128
    GroupGenerator_init(&groupgen, grouping);
9802
128
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9803
9804
    /* if digits are not grouped, thousands separator
9805
       should be an empty string */
9806
128
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9807
9808
128
    digits_pos = d_pos + (forward ? 0 : n_digits);
9809
128
    if (writer) {
9810
64
        buffer_pos = writer->pos + (forward ? 0 : n_buffer);
9811
64
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9812
64
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9813
64
    }
9814
64
    else {
9815
64
        buffer_pos = forward ? 0 : n_buffer;
9816
64
    }
9817
9818
128
    if (!writer) {
9819
64
        *maxchar = 127;
9820
64
    }
9821
9822
128
    while ((len = GroupGenerator_next(&groupgen)) > 0) {
9823
0
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9824
0
        n_zeros = Py_MAX(0, len - remaining);
9825
0
        n_chars = Py_MAX(0, Py_MIN(remaining, len));
9826
9827
        /* Use n_zero zero's and n_chars chars */
9828
9829
        /* Count only, don't do anything. */
9830
0
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9831
9832
        /* Copy into the writer. */
9833
0
        InsertThousandsGrouping_fill(writer, &buffer_pos,
9834
0
                                     digits, &digits_pos,
9835
0
                                     n_chars, n_zeros,
9836
0
                                     use_separator ? thousands_sep : NULL,
9837
0
                                     thousands_sep_len, maxchar, forward);
9838
9839
        /* Use a separator next time. */
9840
0
        use_separator = 1;
9841
9842
0
        remaining -= n_chars;
9843
0
        min_width -= len;
9844
9845
0
        if (remaining <= 0 && min_width <= 0) {
9846
0
            loop_broken = 1;
9847
0
            break;
9848
0
        }
9849
0
        min_width -= thousands_sep_len;
9850
0
    }
9851
128
    if (!loop_broken) {
9852
        /* We left the loop without using a break statement. */
9853
9854
128
        len = Py_MAX(Py_MAX(remaining, min_width), 1);
9855
128
        n_zeros = Py_MAX(0, len - remaining);
9856
128
        n_chars = Py_MAX(0, Py_MIN(remaining, len));
9857
9858
        /* Use n_zero zero's and n_chars chars */
9859
128
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9860
9861
        /* Copy into the writer. */
9862
128
        InsertThousandsGrouping_fill(writer, &buffer_pos,
9863
128
                                     digits, &digits_pos,
9864
128
                                     n_chars, n_zeros,
9865
128
                                     use_separator ? thousands_sep : NULL,
9866
128
                                     thousands_sep_len, maxchar, forward);
9867
128
    }
9868
128
    return count;
9869
128
}
9870
9871
Py_ssize_t
9872
PyUnicode_Count(PyObject *str,
9873
                PyObject *substr,
9874
                Py_ssize_t start,
9875
                Py_ssize_t end)
9876
0
{
9877
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9878
0
        return -1;
9879
9880
0
    return unicode_count_impl(str, substr, start, end);
9881
0
}
9882
9883
Py_ssize_t
9884
PyUnicode_Find(PyObject *str,
9885
               PyObject *substr,
9886
               Py_ssize_t start,
9887
               Py_ssize_t end,
9888
               int direction)
9889
0
{
9890
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9891
0
        return -2;
9892
9893
0
    return any_find_slice(str, substr, start, end, direction);
9894
0
}
9895
9896
Py_ssize_t
9897
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9898
                   Py_ssize_t start, Py_ssize_t end,
9899
                   int direction)
9900
534k
{
9901
534k
    int kind;
9902
534k
    Py_ssize_t len, result;
9903
534k
    len = PyUnicode_GET_LENGTH(str);
9904
534k
    ADJUST_INDICES(start, end, len);
9905
534k
    if (end - start < 1)
9906
0
        return -1;
9907
534k
    kind = PyUnicode_KIND(str);
9908
534k
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9909
534k
                      kind, end-start, ch, direction);
9910
534k
    if (result == -1)
9911
54.0k
        return -1;
9912
480k
    else
9913
480k
        return start + result;
9914
534k
}
9915
9916
static int
9917
tailmatch(PyObject *self,
9918
          PyObject *substring,
9919
          Py_ssize_t start,
9920
          Py_ssize_t end,
9921
          int direction)
9922
103M
{
9923
103M
    int kind_self;
9924
103M
    int kind_sub;
9925
103M
    const void *data_self;
9926
103M
    const void *data_sub;
9927
103M
    Py_ssize_t offset;
9928
103M
    Py_ssize_t i;
9929
103M
    Py_ssize_t end_sub;
9930
9931
103M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9932
103M
    end -= PyUnicode_GET_LENGTH(substring);
9933
103M
    if (end < start)
9934
12.4M
        return 0;
9935
9936
91.4M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9937
0
        return 1;
9938
9939
91.4M
    kind_self = PyUnicode_KIND(self);
9940
91.4M
    data_self = PyUnicode_DATA(self);
9941
91.4M
    kind_sub = PyUnicode_KIND(substring);
9942
91.4M
    data_sub = PyUnicode_DATA(substring);
9943
91.4M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9944
9945
91.4M
    if (direction > 0)
9946
7.67M
        offset = end;
9947
83.7M
    else
9948
83.7M
        offset = start;
9949
9950
91.4M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9951
91.4M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9952
91.4M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9953
45.5M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9954
        /* If both are of the same kind, memcmp is sufficient */
9955
13.6M
        if (kind_self == kind_sub) {
9956
10.0M
            return ! memcmp((char *)data_self +
9957
10.0M
                                (offset * PyUnicode_KIND(substring)),
9958
10.0M
                            data_sub,
9959
10.0M
                            PyUnicode_GET_LENGTH(substring) *
9960
10.0M
                                PyUnicode_KIND(substring));
9961
10.0M
        }
9962
        /* otherwise we have to compare each character by first accessing it */
9963
3.62M
        else {
9964
            /* We do not need to compare 0 and len(substring)-1 because
9965
               the if statement above ensured already that they are equal
9966
               when we end up here. */
9967
3.71M
            for (i = 1; i < end_sub; ++i) {
9968
95.3k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9969
95.3k
                    PyUnicode_READ(kind_sub, data_sub, i))
9970
8.18k
                    return 0;
9971
95.3k
            }
9972
3.61M
            return 1;
9973
3.62M
        }
9974
13.6M
    }
9975
9976
77.7M
    return 0;
9977
91.4M
}
9978
9979
Py_ssize_t
9980
PyUnicode_Tailmatch(PyObject *str,
9981
                    PyObject *substr,
9982
                    Py_ssize_t start,
9983
                    Py_ssize_t end,
9984
                    int direction)
9985
0
{
9986
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9987
0
        return -1;
9988
9989
0
    return tailmatch(str, substr, start, end, direction);
9990
0
}
9991
9992
/* Build a new string of the same length as `self` (max char 127) holding
   the lower-cased (lower != 0) or upper-cased bytes of `self`, converted
   with _Py_bytes_lower() / _Py_bytes_upper().  Returns NULL if the result
   cannot be allocated. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t nbytes = PyUnicode_GET_LENGTH(self);
    const char *src = PyUnicode_DATA(self);

    PyObject *converted = PyUnicode_New(nbytes, 127);
    if (converted == NULL) {
        return NULL;
    }

    char *dst = PyUnicode_DATA(converted);
    if (lower) {
        _Py_bytes_lower(dst, src, nbytes);
    }
    else {
        _Py_bytes_upper(dst, src, nbytes);
    }
    return converted;
}
10010
10011
static Py_UCS4
10012
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
10013
29.4k
{
10014
29.4k
    Py_ssize_t j;
10015
29.4k
    int final_sigma;
10016
29.4k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
10017
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
10018
10019
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10020
10021
    where ! is a negation and \p{xxx} is a character with property xxx.
10022
    */
10023
76.3k
    for (j = i - 1; j >= 0; j--) {
10024
74.8k
        c = PyUnicode_READ(kind, data, j);
10025
74.8k
        if (!_PyUnicode_IsCaseIgnorable(c))
10026
27.9k
            break;
10027
74.8k
    }
10028
29.4k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10029
29.4k
    if (final_sigma) {
10030
59.3k
        for (j = i + 1; j < length; j++) {
10031
58.4k
            c = PyUnicode_READ(kind, data, j);
10032
58.4k
            if (!_PyUnicode_IsCaseIgnorable(c))
10033
21.0k
                break;
10034
58.4k
        }
10035
21.9k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
10036
21.9k
    }
10037
29.4k
    return (final_sigma) ? 0x3C2 : 0x3C3;
10038
29.4k
}
10039
10040
static int
10041
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
10042
           Py_UCS4 c, Py_UCS4 *mapped)
10043
98.3M
{
10044
    /* Obscure special case. */
10045
98.3M
    if (c == 0x3A3) {
10046
29.4k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
10047
29.4k
        return 1;
10048
29.4k
    }
10049
98.3M
    return _PyUnicode_ToLowerFull(c, mapped);
10050
98.3M
}
10051
10052
static Py_ssize_t
10053
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10054
0
{
10055
0
    Py_ssize_t i, k = 0;
10056
0
    int n_res, j;
10057
0
    Py_UCS4 c, mapped[3];
10058
10059
0
    c = PyUnicode_READ(kind, data, 0);
10060
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
10061
0
    for (j = 0; j < n_res; j++) {
10062
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
10063
0
        res[k++] = mapped[j];
10064
0
    }
10065
0
    for (i = 1; i < length; i++) {
10066
0
        c = PyUnicode_READ(kind, data, i);
10067
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
10068
0
        for (j = 0; j < n_res; j++) {
10069
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10070
0
            res[k++] = mapped[j];
10071
0
        }
10072
0
    }
10073
0
    return k;
10074
0
}
10075
10076
static Py_ssize_t
10077
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
10078
0
    Py_ssize_t i, k = 0;
10079
10080
0
    for (i = 0; i < length; i++) {
10081
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10082
0
        int n_res, j;
10083
0
        if (Py_UNICODE_ISUPPER(c)) {
10084
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
10085
0
        }
10086
0
        else if (Py_UNICODE_ISLOWER(c)) {
10087
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
10088
0
        }
10089
0
        else {
10090
0
            n_res = 1;
10091
0
            mapped[0] = c;
10092
0
        }
10093
0
        for (j = 0; j < n_res; j++) {
10094
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10095
0
            res[k++] = mapped[j];
10096
0
        }
10097
0
    }
10098
0
    return k;
10099
0
}
10100
10101
static Py_ssize_t
10102
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
10103
                  Py_UCS4 *maxchar, int lower)
10104
26.9M
{
10105
26.9M
    Py_ssize_t i, k = 0;
10106
10107
125M
    for (i = 0; i < length; i++) {
10108
98.3M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10109
98.3M
        int n_res, j;
10110
98.3M
        if (lower)
10111
98.3M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
10112
0
        else
10113
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
10114
196M
        for (j = 0; j < n_res; j++) {
10115
98.3M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10116
98.3M
            res[k++] = mapped[j];
10117
98.3M
        }
10118
98.3M
    }
10119
26.9M
    return k;
10120
26.9M
}
10121
10122
static Py_ssize_t
10123
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10124
0
{
10125
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10126
0
}
10127
10128
static Py_ssize_t
10129
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10130
26.9M
{
10131
26.9M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10132
26.9M
}
10133
10134
static Py_ssize_t
10135
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10136
0
{
10137
0
    Py_ssize_t i, k = 0;
10138
10139
0
    for (i = 0; i < length; i++) {
10140
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
10141
0
        Py_UCS4 mapped[3];
10142
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10143
0
        for (j = 0; j < n_res; j++) {
10144
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10145
0
            res[k++] = mapped[j];
10146
0
        }
10147
0
    }
10148
0
    return k;
10149
0
}
10150
10151
static Py_ssize_t
10152
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
10153
0
{
10154
0
    Py_ssize_t i, k = 0;
10155
0
    int previous_is_cased;
10156
10157
0
    previous_is_cased = 0;
10158
0
    for (i = 0; i < length; i++) {
10159
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10160
0
        Py_UCS4 mapped[3];
10161
0
        int n_res, j;
10162
10163
0
        if (previous_is_cased)
10164
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
10165
0
        else
10166
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
10167
10168
0
        for (j = 0; j < n_res; j++) {
10169
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10170
0
            res[k++] = mapped[j];
10171
0
        }
10172
10173
0
        previous_is_cased = _PyUnicode_IsCased(c);
10174
0
    }
10175
0
    return k;
10176
0
}
10177
10178
static PyObject *
10179
case_operation(PyObject *self,
10180
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10181
26.9M
{
10182
26.9M
    PyObject *res = NULL;
10183
26.9M
    Py_ssize_t length, newlength = 0;
10184
26.9M
    int kind, outkind;
10185
26.9M
    const void *data;
10186
26.9M
    void *outdata;
10187
26.9M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
10188
10189
26.9M
    kind = PyUnicode_KIND(self);
10190
26.9M
    data = PyUnicode_DATA(self);
10191
26.9M
    length = PyUnicode_GET_LENGTH(self);
10192
26.9M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10193
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
10194
0
        return NULL;
10195
0
    }
10196
26.9M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10197
26.9M
    if (tmp == NULL)
10198
0
        return PyErr_NoMemory();
10199
26.9M
    newlength = perform(kind, data, length, tmp, &maxchar);
10200
26.9M
    res = PyUnicode_New(newlength, maxchar);
10201
26.9M
    if (res == NULL)
10202
0
        goto leave;
10203
26.9M
    tmpend = tmp + newlength;
10204
26.9M
    outdata = PyUnicode_DATA(res);
10205
26.9M
    outkind = PyUnicode_KIND(res);
10206
26.9M
    switch (outkind) {
10207
238k
    case PyUnicode_1BYTE_KIND:
10208
238k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10209
238k
        break;
10210
26.6M
    case PyUnicode_2BYTE_KIND:
10211
26.6M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10212
26.6M
        break;
10213
61.5k
    case PyUnicode_4BYTE_KIND:
10214
61.5k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10215
61.5k
        break;
10216
0
    default:
10217
0
        Py_UNREACHABLE();
10218
26.9M
    }
10219
26.9M
  leave:
10220
26.9M
    PyMem_Free(tmp);
10221
26.9M
    return res;
10222
26.9M
}
10223
10224
/* Join the items of `seq` with `separator`.

   `seq` is first converted with PySequence_Fast(); if that fails NULL is
   returned.  The actual work is done by _PyUnicode_JoinArray() on the
   item array of the converted sequence, inside the sequence-fast critical
   section taken on `seq`. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");

    if (fast == NULL) {
        return NULL;
    }

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    joined = _PyUnicode_JoinArray(separator,
                                  PySequence_Fast_ITEMS(fast),
                                  PySequence_Fast_GET_SIZE(fast));

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
10248
10249
PyObject *
10250
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10251
48.8M
{
10252
48.8M
    PyObject *res = NULL; /* the result */
10253
48.8M
    PyObject *sep = NULL;
10254
48.8M
    Py_ssize_t seplen;
10255
48.8M
    PyObject *item;
10256
48.8M
    Py_ssize_t sz, i, res_offset;
10257
48.8M
    Py_UCS4 maxchar;
10258
48.8M
    Py_UCS4 item_maxchar;
10259
48.8M
    int use_memcpy;
10260
48.8M
    unsigned char *res_data = NULL, *sep_data = NULL;
10261
48.8M
    PyObject *last_obj;
10262
48.8M
    int kind = 0;
10263
10264
    /* If empty sequence, return u"". */
10265
48.8M
    if (seqlen == 0) {
10266
5.39M
        _Py_RETURN_UNICODE_EMPTY();
10267
5.39M
    }
10268
10269
    /* If singleton sequence with an exact Unicode, return that. */
10270
43.5M
    last_obj = NULL;
10271
43.5M
    if (seqlen == 1) {
10272
7.30M
        if (PyUnicode_CheckExact(items[0])) {
10273
5.66M
            res = items[0];
10274
5.66M
            return Py_NewRef(res);
10275
5.66M
        }
10276
1.64M
        seplen = 0;
10277
1.64M
        maxchar = 0;
10278
1.64M
    }
10279
36.1M
    else {
10280
        /* Set up sep and seplen */
10281
36.1M
        if (separator == NULL) {
10282
            /* fall back to a blank space separator */
10283
0
            sep = PyUnicode_FromOrdinal(' ');
10284
0
            if (!sep)
10285
0
                goto onError;
10286
0
            seplen = 1;
10287
0
            maxchar = 32;
10288
0
        }
10289
36.1M
        else {
10290
36.1M
            if (!PyUnicode_Check(separator)) {
10291
0
                PyErr_Format(PyExc_TypeError,
10292
0
                             "separator: expected str instance,"
10293
0
                             " %.80s found",
10294
0
                             Py_TYPE(separator)->tp_name);
10295
0
                goto onError;
10296
0
            }
10297
36.1M
            sep = separator;
10298
36.1M
            seplen = PyUnicode_GET_LENGTH(separator);
10299
36.1M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10300
            /* inc refcount to keep this code path symmetric with the
10301
               above case of a blank separator */
10302
36.1M
            Py_INCREF(sep);
10303
36.1M
        }
10304
36.1M
        last_obj = sep;
10305
36.1M
    }
10306
10307
    /* There are at least two things to join, or else we have a subclass
10308
     * of str in the sequence.
10309
     * Do a pre-pass to figure out the total amount of space we'll
10310
     * need (sz), and see whether all argument are strings.
10311
     */
10312
37.8M
    sz = 0;
10313
#ifdef Py_DEBUG
10314
    use_memcpy = 0;
10315
#else
10316
37.8M
    use_memcpy = 1;
10317
37.8M
#endif
10318
369M
    for (i = 0; i < seqlen; i++) {
10319
331M
        size_t add_sz;
10320
331M
        item = items[i];
10321
331M
        if (!PyUnicode_Check(item)) {
10322
0
            PyErr_Format(PyExc_TypeError,
10323
0
                         "sequence item %zd: expected str instance,"
10324
0
                         " %.80s found",
10325
0
                         i, Py_TYPE(item)->tp_name);
10326
0
            goto onError;
10327
0
        }
10328
331M
        add_sz = PyUnicode_GET_LENGTH(item);
10329
331M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10330
331M
        maxchar = Py_MAX(maxchar, item_maxchar);
10331
331M
        if (i != 0) {
10332
293M
            add_sz += seplen;
10333
293M
        }
10334
331M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10335
0
            PyErr_SetString(PyExc_OverflowError,
10336
0
                            "join() result is too long for a Python string");
10337
0
            goto onError;
10338
0
        }
10339
331M
        sz += add_sz;
10340
331M
        if (use_memcpy && last_obj != NULL) {
10341
262M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10342
3.79M
                use_memcpy = 0;
10343
262M
        }
10344
331M
        last_obj = item;
10345
331M
    }
10346
10347
37.8M
    res = PyUnicode_New(sz, maxchar);
10348
37.8M
    if (res == NULL)
10349
0
        goto onError;
10350
10351
    /* Catenate everything. */
10352
#ifdef Py_DEBUG
10353
    use_memcpy = 0;
10354
#else
10355
37.8M
    if (use_memcpy) {
10356
34.0M
        res_data = PyUnicode_1BYTE_DATA(res);
10357
34.0M
        kind = PyUnicode_KIND(res);
10358
34.0M
        if (seplen != 0)
10359
15.8k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10360
34.0M
    }
10361
37.8M
#endif
10362
37.8M
    if (use_memcpy) {
10363
267M
        for (i = 0; i < seqlen; ++i) {
10364
233M
            Py_ssize_t itemlen;
10365
233M
            item = items[i];
10366
10367
            /* Copy item, and maybe the separator. */
10368
233M
            if (i && seplen != 0) {
10369
20.2k
                memcpy(res_data,
10370
20.2k
                          sep_data,
10371
20.2k
                          kind * seplen);
10372
20.2k
                res_data += kind * seplen;
10373
20.2k
            }
10374
10375
233M
            itemlen = PyUnicode_GET_LENGTH(item);
10376
233M
            if (itemlen != 0) {
10377
208M
                memcpy(res_data,
10378
208M
                          PyUnicode_DATA(item),
10379
208M
                          kind * itemlen);
10380
208M
                res_data += kind * itemlen;
10381
208M
            }
10382
233M
        }
10383
34.0M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10384
34.0M
                           + kind * PyUnicode_GET_LENGTH(res));
10385
34.0M
    }
10386
3.79M
    else {
10387
101M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10388
97.4M
            Py_ssize_t itemlen;
10389
97.4M
            item = items[i];
10390
10391
            /* Copy item, and maybe the separator. */
10392
97.4M
            if (i && seplen != 0) {
10393
70.3k
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10394
70.3k
                res_offset += seplen;
10395
70.3k
            }
10396
10397
97.4M
            itemlen = PyUnicode_GET_LENGTH(item);
10398
97.4M
            if (itemlen != 0) {
10399
97.1M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10400
97.1M
                res_offset += itemlen;
10401
97.1M
            }
10402
97.4M
        }
10403
3.79M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10404
3.79M
    }
10405
10406
37.8M
    Py_XDECREF(sep);
10407
37.8M
    assert(_PyUnicode_CheckConsistency(res, 1));
10408
37.8M
    return res;
10409
10410
0
  onError:
10411
0
    Py_XDECREF(sep);
10412
0
    Py_XDECREF(res);
10413
0
    return NULL;
10414
37.8M
}
10415
10416
void
10417
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10418
                    Py_UCS4 fill_char)
10419
698
{
10420
698
    const int kind = PyUnicode_KIND(unicode);
10421
698
    void *data = PyUnicode_DATA(unicode);
10422
698
    assert(unicode_modifiable(unicode));
10423
698
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10424
698
    assert(start >= 0);
10425
698
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10426
698
    unicode_fill(kind, data, fill_char, start, length);
10427
698
}
10428
10429
Py_ssize_t
10430
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10431
               Py_UCS4 fill_char)
10432
698
{
10433
698
    Py_ssize_t maxlen;
10434
10435
698
    if (!PyUnicode_Check(unicode)) {
10436
0
        PyErr_BadInternalCall();
10437
0
        return -1;
10438
0
    }
10439
698
    if (unicode_check_modifiable(unicode))
10440
0
        return -1;
10441
10442
698
    if (start < 0) {
10443
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10444
0
        return -1;
10445
0
    }
10446
698
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10447
0
        PyErr_SetString(PyExc_ValueError,
10448
0
                         "fill character is bigger than "
10449
0
                         "the string maximum character");
10450
0
        return -1;
10451
0
    }
10452
10453
698
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10454
698
    length = Py_MIN(maxlen, length);
10455
698
    if (length <= 0)
10456
0
        return 0;
10457
10458
698
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10459
698
    return length;
10460
698
}
10461
10462
static PyObject *
10463
pad(PyObject *self,
10464
    Py_ssize_t left,
10465
    Py_ssize_t right,
10466
    Py_UCS4 fill)
10467
0
{
10468
0
    PyObject *u;
10469
0
    Py_UCS4 maxchar;
10470
0
    int kind;
10471
0
    void *data;
10472
10473
0
    if (left < 0)
10474
0
        left = 0;
10475
0
    if (right < 0)
10476
0
        right = 0;
10477
10478
0
    if (left == 0 && right == 0)
10479
0
        return unicode_result_unchanged(self);
10480
10481
0
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10482
0
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10483
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10484
0
        return NULL;
10485
0
    }
10486
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10487
0
    maxchar = Py_MAX(maxchar, fill);
10488
0
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10489
0
    if (!u)
10490
0
        return NULL;
10491
10492
0
    kind = PyUnicode_KIND(u);
10493
0
    data = PyUnicode_DATA(u);
10494
0
    if (left)
10495
0
        unicode_fill(kind, data, fill, 0, left);
10496
0
    if (right)
10497
0
        unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10498
0
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10499
0
    assert(_PyUnicode_CheckConsistency(u, 1));
10500
0
    return u;
10501
0
}
10502
10503
/* Split `string` at line boundaries and return the resulting list.

   `keepends` is passed through unchanged to the kind-specific stringlib
   helper.  Returns NULL if ensure_unicode() rejects `string`; otherwise
   returns whatever the selected helper returns. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    Py_ssize_t length;
    int kind;

    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    length = PyUnicode_GET_LENGTH(string);
    kind = PyUnicode_KIND(string);

    /* Dispatch on the character width; 1-byte strings additionally get a
       dedicated ASCII implementation. */
    if (kind == PyUnicode_1BYTE_KIND) {
        if (PyUnicode_IS_ASCII(string)) {
            return asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        return ucs1lib_splitlines(
            string, PyUnicode_1BYTE_DATA(string), length, keepends);
    }
    if (kind == PyUnicode_2BYTE_KIND) {
        return ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), length, keepends);
    }
    if (kind == PyUnicode_4BYTE_KIND) {
        return ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), length, keepends);
    }
    Py_UNREACHABLE();
}
10537
10538
static PyObject *
10539
split(PyObject *self,
10540
      PyObject *substring,
10541
      Py_ssize_t maxcount)
10542
24.2M
{
10543
24.2M
    int kind1, kind2;
10544
24.2M
    const void *buf1, *buf2;
10545
24.2M
    Py_ssize_t len1, len2;
10546
24.2M
    PyObject* out;
10547
24.2M
    len1 = PyUnicode_GET_LENGTH(self);
10548
24.2M
    kind1 = PyUnicode_KIND(self);
10549
10550
24.2M
    if (substring == NULL) {
10551
165k
        if (maxcount < 0) {
10552
140k
            maxcount = (len1 - 1) / 2 + 1;
10553
140k
        }
10554
165k
        switch (kind1) {
10555
107k
        case PyUnicode_1BYTE_KIND:
10556
107k
            if (PyUnicode_IS_ASCII(self))
10557
76.7k
                return asciilib_split_whitespace(
10558
76.7k
                    self,  PyUnicode_1BYTE_DATA(self),
10559
76.7k
                    len1, maxcount
10560
76.7k
                    );
10561
30.9k
            else
10562
30.9k
                return ucs1lib_split_whitespace(
10563
30.9k
                    self,  PyUnicode_1BYTE_DATA(self),
10564
30.9k
                    len1, maxcount
10565
30.9k
                    );
10566
47.7k
        case PyUnicode_2BYTE_KIND:
10567
47.7k
            return ucs2lib_split_whitespace(
10568
47.7k
                self,  PyUnicode_2BYTE_DATA(self),
10569
47.7k
                len1, maxcount
10570
47.7k
                );
10571
10.4k
        case PyUnicode_4BYTE_KIND:
10572
10.4k
            return ucs4lib_split_whitespace(
10573
10.4k
                self,  PyUnicode_4BYTE_DATA(self),
10574
10.4k
                len1, maxcount
10575
10.4k
                );
10576
0
        default:
10577
0
            Py_UNREACHABLE();
10578
165k
        }
10579
165k
    }
10580
10581
24.1M
    kind2 = PyUnicode_KIND(substring);
10582
24.1M
    len2 = PyUnicode_GET_LENGTH(substring);
10583
24.1M
    if (maxcount < 0) {
10584
        // if len2 == 0, it will raise ValueError.
10585
12.6M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10586
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10587
12.6M
        maxcount = maxcount < 0 ? len1 : maxcount;
10588
12.6M
    }
10589
24.1M
    if (kind1 < kind2 || len1 < len2) {
10590
6.59M
        out = PyList_New(1);
10591
6.59M
        if (out == NULL)
10592
0
            return NULL;
10593
6.59M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10594
6.59M
        return out;
10595
6.59M
    }
10596
17.5M
    buf1 = PyUnicode_DATA(self);
10597
17.5M
    buf2 = PyUnicode_DATA(substring);
10598
17.5M
    if (kind2 != kind1) {
10599
211k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10600
211k
        if (!buf2)
10601
0
            return NULL;
10602
211k
    }
10603
10604
17.5M
    switch (kind1) {
10605
17.3M
    case PyUnicode_1BYTE_KIND:
10606
17.3M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10607
16.2M
            out = asciilib_split(
10608
16.2M
                self,  buf1, len1, buf2, len2, maxcount);
10609
1.07M
        else
10610
1.07M
            out = ucs1lib_split(
10611
1.07M
                self,  buf1, len1, buf2, len2, maxcount);
10612
17.3M
        break;
10613
178k
    case PyUnicode_2BYTE_KIND:
10614
178k
        out = ucs2lib_split(
10615
178k
            self,  buf1, len1, buf2, len2, maxcount);
10616
178k
        break;
10617
33.5k
    case PyUnicode_4BYTE_KIND:
10618
33.5k
        out = ucs4lib_split(
10619
33.5k
            self,  buf1, len1, buf2, len2, maxcount);
10620
33.5k
        break;
10621
0
    default:
10622
0
        out = NULL;
10623
17.5M
    }
10624
17.5M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10625
17.5M
    if (kind2 != kind1)
10626
211k
        PyMem_Free((void *)buf2);
10627
17.5M
    return out;
10628
17.5M
}
10629
10630
static PyObject *
10631
rsplit(PyObject *self,
10632
       PyObject *substring,
10633
       Py_ssize_t maxcount)
10634
50
{
10635
50
    int kind1, kind2;
10636
50
    const void *buf1, *buf2;
10637
50
    Py_ssize_t len1, len2;
10638
50
    PyObject* out;
10639
10640
50
    len1 = PyUnicode_GET_LENGTH(self);
10641
50
    kind1 = PyUnicode_KIND(self);
10642
10643
50
    if (substring == NULL) {
10644
0
        if (maxcount < 0) {
10645
0
            maxcount = (len1 - 1) / 2 + 1;
10646
0
        }
10647
0
        switch (kind1) {
10648
0
        case PyUnicode_1BYTE_KIND:
10649
0
            if (PyUnicode_IS_ASCII(self))
10650
0
                return asciilib_rsplit_whitespace(
10651
0
                    self,  PyUnicode_1BYTE_DATA(self),
10652
0
                    len1, maxcount
10653
0
                    );
10654
0
            else
10655
0
                return ucs1lib_rsplit_whitespace(
10656
0
                    self,  PyUnicode_1BYTE_DATA(self),
10657
0
                    len1, maxcount
10658
0
                    );
10659
0
        case PyUnicode_2BYTE_KIND:
10660
0
            return ucs2lib_rsplit_whitespace(
10661
0
                self,  PyUnicode_2BYTE_DATA(self),
10662
0
                len1, maxcount
10663
0
                );
10664
0
        case PyUnicode_4BYTE_KIND:
10665
0
            return ucs4lib_rsplit_whitespace(
10666
0
                self,  PyUnicode_4BYTE_DATA(self),
10667
0
                len1, maxcount
10668
0
                );
10669
0
        default:
10670
0
            Py_UNREACHABLE();
10671
0
        }
10672
0
    }
10673
50
    kind2 = PyUnicode_KIND(substring);
10674
50
    len2 = PyUnicode_GET_LENGTH(substring);
10675
50
    if (maxcount < 0) {
10676
        // if len2 == 0, it will raise ValueError.
10677
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10678
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10679
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10680
0
    }
10681
50
    if (kind1 < kind2 || len1 < len2) {
10682
0
        out = PyList_New(1);
10683
0
        if (out == NULL)
10684
0
            return NULL;
10685
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10686
0
        return out;
10687
0
    }
10688
50
    buf1 = PyUnicode_DATA(self);
10689
50
    buf2 = PyUnicode_DATA(substring);
10690
50
    if (kind2 != kind1) {
10691
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10692
0
        if (!buf2)
10693
0
            return NULL;
10694
0
    }
10695
10696
50
    switch (kind1) {
10697
50
    case PyUnicode_1BYTE_KIND:
10698
50
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10699
50
            out = asciilib_rsplit(
10700
50
                self,  buf1, len1, buf2, len2, maxcount);
10701
0
        else
10702
0
            out = ucs1lib_rsplit(
10703
0
                self,  buf1, len1, buf2, len2, maxcount);
10704
50
        break;
10705
0
    case PyUnicode_2BYTE_KIND:
10706
0
        out = ucs2lib_rsplit(
10707
0
            self,  buf1, len1, buf2, len2, maxcount);
10708
0
        break;
10709
0
    case PyUnicode_4BYTE_KIND:
10710
0
        out = ucs4lib_rsplit(
10711
0
            self,  buf1, len1, buf2, len2, maxcount);
10712
0
        break;
10713
0
    default:
10714
0
        out = NULL;
10715
50
    }
10716
50
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10717
50
    if (kind2 != kind1)
10718
0
        PyMem_Free((void *)buf2);
10719
50
    return out;
10720
50
}
10721
10722
static Py_ssize_t
10723
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10724
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10725
152M
{
10726
152M
    switch (kind) {
10727
19.5M
    case PyUnicode_1BYTE_KIND:
10728
19.5M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10729
16.1M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10730
3.41M
        else
10731
3.41M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10732
56.9M
    case PyUnicode_2BYTE_KIND:
10733
56.9M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10734
75.5M
    case PyUnicode_4BYTE_KIND:
10735
75.5M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10736
152M
    }
10737
152M
    Py_UNREACHABLE();
10738
152M
}
10739
10740
static Py_ssize_t
10741
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10742
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10743
47.3M
{
10744
47.3M
    switch (kind) {
10745
43.0M
    case PyUnicode_1BYTE_KIND:
10746
43.0M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10747
4.23M
    case PyUnicode_2BYTE_KIND:
10748
4.23M
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10749
129k
    case PyUnicode_4BYTE_KIND:
10750
129k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10751
47.3M
    }
10752
47.3M
    Py_UNREACHABLE();
10753
47.3M
}
10754
10755
static void
10756
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10757
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10758
1.05M
{
10759
1.05M
    int kind = PyUnicode_KIND(u);
10760
1.05M
    void *data = PyUnicode_DATA(u);
10761
1.05M
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10762
1.05M
    if (kind == PyUnicode_1BYTE_KIND) {
10763
384k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10764
384k
                                      (Py_UCS1 *)data + len,
10765
384k
                                      u1, u2, maxcount);
10766
384k
    }
10767
672k
    else if (kind == PyUnicode_2BYTE_KIND) {
10768
659k
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10769
659k
                                      (Py_UCS2 *)data + len,
10770
659k
                                      u1, u2, maxcount);
10771
659k
    }
10772
13.4k
    else {
10773
13.4k
        assert(kind == PyUnicode_4BYTE_KIND);
10774
13.4k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10775
13.4k
                                      (Py_UCS4 *)data + len,
10776
13.4k
                                      u1, u2, maxcount);
10777
13.4k
    }
10778
1.05M
}
10779
10780
static PyObject *
10781
replace(PyObject *self, PyObject *str1,
10782
        PyObject *str2, Py_ssize_t maxcount)
10783
88.8M
{
10784
88.8M
    PyObject *u;
10785
88.8M
    const char *sbuf = PyUnicode_DATA(self);
10786
88.8M
    const void *buf1 = PyUnicode_DATA(str1);
10787
88.8M
    const void *buf2 = PyUnicode_DATA(str2);
10788
88.8M
    int srelease = 0, release1 = 0, release2 = 0;
10789
88.8M
    int skind = PyUnicode_KIND(self);
10790
88.8M
    int kind1 = PyUnicode_KIND(str1);
10791
88.8M
    int kind2 = PyUnicode_KIND(str2);
10792
88.8M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10793
88.8M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10794
88.8M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10795
88.8M
    int mayshrink;
10796
88.8M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10797
10798
88.8M
    if (slen < len1)
10799
36.2M
        goto nothing;
10800
10801
52.5M
    if (maxcount < 0)
10802
52.5M
        maxcount = PY_SSIZE_T_MAX;
10803
0
    else if (maxcount == 0)
10804
0
        goto nothing;
10805
10806
52.5M
    if (str1 == str2)
10807
0
        goto nothing;
10808
10809
52.5M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10810
52.5M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10811
52.5M
    if (maxchar < maxchar_str1)
10812
        /* substring too wide to be present */
10813
0
        goto nothing;
10814
52.5M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10815
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10816
       result string. */
10817
52.5M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10818
52.5M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10819
10820
52.5M
    if (len1 == len2) {
10821
        /* same length */
10822
5.18M
        if (len1 == 0)
10823
0
            goto nothing;
10824
5.18M
        if (len1 == 1) {
10825
            /* replace characters */
10826
5.18M
            Py_UCS4 u1, u2;
10827
5.18M
            Py_ssize_t pos;
10828
10829
5.18M
            u1 = PyUnicode_READ(kind1, buf1, 0);
10830
5.18M
            pos = findchar(sbuf, skind, slen, u1, 1);
10831
5.18M
            if (pos < 0)
10832
4.12M
                goto nothing;
10833
1.05M
            u2 = PyUnicode_READ(kind2, buf2, 0);
10834
1.05M
            u = PyUnicode_New(slen, maxchar);
10835
1.05M
            if (!u)
10836
0
                goto error;
10837
10838
1.05M
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10839
1.05M
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10840
1.05M
        }
10841
0
        else {
10842
0
            int rkind = skind;
10843
0
            char *res;
10844
0
            Py_ssize_t i;
10845
10846
0
            if (kind1 < rkind) {
10847
                /* widen substring */
10848
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10849
0
                if (!buf1) goto error;
10850
0
                release1 = 1;
10851
0
            }
10852
0
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10853
0
            if (i < 0)
10854
0
                goto nothing;
10855
0
            if (rkind > kind2) {
10856
                /* widen replacement */
10857
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10858
0
                if (!buf2) goto error;
10859
0
                release2 = 1;
10860
0
            }
10861
0
            else if (rkind < kind2) {
10862
                /* widen self and buf1 */
10863
0
                rkind = kind2;
10864
0
                if (release1) {
10865
0
                    assert(buf1 != PyUnicode_DATA(str1));
10866
0
                    PyMem_Free((void *)buf1);
10867
0
                    buf1 = PyUnicode_DATA(str1);
10868
0
                    release1 = 0;
10869
0
                }
10870
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10871
0
                if (!sbuf) goto error;
10872
0
                srelease = 1;
10873
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10874
0
                if (!buf1) goto error;
10875
0
                release1 = 1;
10876
0
            }
10877
0
            u = PyUnicode_New(slen, maxchar);
10878
0
            if (!u)
10879
0
                goto error;
10880
0
            assert(PyUnicode_KIND(u) == rkind);
10881
0
            res = PyUnicode_DATA(u);
10882
10883
0
            memcpy(res, sbuf, rkind * slen);
10884
            /* change everything in-place, starting with this one */
10885
0
            memcpy(res + rkind * i,
10886
0
                   buf2,
10887
0
                   rkind * len2);
10888
0
            i += len1;
10889
10890
0
            while ( --maxcount > 0) {
10891
0
                i = anylib_find(rkind, self,
10892
0
                                sbuf+rkind*i, slen-i,
10893
0
                                str1, buf1, len1, i);
10894
0
                if (i == -1)
10895
0
                    break;
10896
0
                memcpy(res + rkind * i,
10897
0
                       buf2,
10898
0
                       rkind * len2);
10899
0
                i += len1;
10900
0
            }
10901
0
        }
10902
5.18M
    }
10903
47.3M
    else {
10904
47.3M
        Py_ssize_t n, i, j, ires;
10905
47.3M
        Py_ssize_t new_size;
10906
47.3M
        int rkind = skind;
10907
47.3M
        char *res;
10908
10909
47.3M
        if (kind1 < rkind) {
10910
            /* widen substring */
10911
4.36M
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10912
4.36M
            if (!buf1) goto error;
10913
4.36M
            release1 = 1;
10914
4.36M
        }
10915
47.3M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10916
47.3M
        if (n == 0)
10917
41.6M
            goto nothing;
10918
5.74M
        if (kind2 < rkind) {
10919
            /* widen replacement */
10920
891k
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10921
891k
            if (!buf2) goto error;
10922
891k
            release2 = 1;
10923
891k
        }
10924
4.85M
        else if (kind2 > rkind) {
10925
            /* widen self and buf1 */
10926
0
            rkind = kind2;
10927
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10928
0
            if (!sbuf) goto error;
10929
0
            srelease = 1;
10930
0
            if (release1) {
10931
0
                assert(buf1 != PyUnicode_DATA(str1));
10932
0
                PyMem_Free((void *)buf1);
10933
0
                buf1 = PyUnicode_DATA(str1);
10934
0
                release1 = 0;
10935
0
            }
10936
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10937
0
            if (!buf1) goto error;
10938
0
            release1 = 1;
10939
0
        }
10940
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10941
           PyUnicode_GET_LENGTH(str1)); */
10942
5.74M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10943
0
                PyErr_SetString(PyExc_OverflowError,
10944
0
                                "replace string is too long");
10945
0
                goto error;
10946
0
        }
10947
5.74M
        new_size = slen + n * (len2 - len1);
10948
5.74M
        if (new_size == 0) {
10949
0
            u = unicode_get_empty();
10950
0
            goto done;
10951
0
        }
10952
5.74M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10953
0
            PyErr_SetString(PyExc_OverflowError,
10954
0
                            "replace string is too long");
10955
0
            goto error;
10956
0
        }
10957
5.74M
        u = PyUnicode_New(new_size, maxchar);
10958
5.74M
        if (!u)
10959
0
            goto error;
10960
5.74M
        assert(PyUnicode_KIND(u) == rkind);
10961
5.74M
        res = PyUnicode_DATA(u);
10962
5.74M
        ires = i = 0;
10963
5.74M
        if (len1 > 0) {
10964
157M
            while (n-- > 0) {
10965
                /* look for next match */
10966
152M
                j = anylib_find(rkind, self,
10967
152M
                                sbuf + rkind * i, slen-i,
10968
152M
                                str1, buf1, len1, i);
10969
152M
                if (j == -1)
10970
0
                    break;
10971
152M
                else if (j > i) {
10972
                    /* copy unchanged part [i:j] */
10973
22.6M
                    memcpy(res + rkind * ires,
10974
22.6M
                           sbuf + rkind * i,
10975
22.6M
                           rkind * (j-i));
10976
22.6M
                    ires += j - i;
10977
22.6M
                }
10978
                /* copy substitution string */
10979
152M
                if (len2 > 0) {
10980
152M
                    memcpy(res + rkind * ires,
10981
152M
                           buf2,
10982
152M
                           rkind * len2);
10983
152M
                    ires += len2;
10984
152M
                }
10985
152M
                i = j + len1;
10986
152M
            }
10987
5.74M
            if (i < slen)
10988
                /* copy tail [i:] */
10989
5.67M
                memcpy(res + rkind * ires,
10990
5.67M
                       sbuf + rkind * i,
10991
5.67M
                       rkind * (slen-i));
10992
5.74M
        }
10993
0
        else {
10994
            /* interleave */
10995
0
            while (n > 0) {
10996
0
                memcpy(res + rkind * ires,
10997
0
                       buf2,
10998
0
                       rkind * len2);
10999
0
                ires += len2;
11000
0
                if (--n <= 0)
11001
0
                    break;
11002
0
                memcpy(res + rkind * ires,
11003
0
                       sbuf + rkind * i,
11004
0
                       rkind);
11005
0
                ires++;
11006
0
                i++;
11007
0
            }
11008
0
            memcpy(res + rkind * ires,
11009
0
                   sbuf + rkind * i,
11010
0
                   rkind * (slen-i));
11011
0
        }
11012
5.74M
    }
11013
11014
6.79M
    if (mayshrink) {
11015
0
        unicode_adjust_maxchar(&u);
11016
0
        if (u == NULL)
11017
0
            goto error;
11018
0
    }
11019
11020
6.79M
  done:
11021
6.79M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
11022
6.79M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11023
6.79M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11024
6.79M
    if (srelease)
11025
0
        PyMem_Free((void *)sbuf);
11026
6.79M
    if (release1)
11027
891k
        PyMem_Free((void *)buf1);
11028
6.79M
    if (release2)
11029
891k
        PyMem_Free((void *)buf2);
11030
6.79M
    assert(_PyUnicode_CheckConsistency(u, 1));
11031
6.79M
    return u;
11032
11033
82.0M
  nothing:
11034
    /* nothing to replace; return original string (when possible) */
11035
82.0M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
11036
82.0M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11037
82.0M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11038
82.0M
    if (srelease)
11039
0
        PyMem_Free((void *)sbuf);
11040
82.0M
    if (release1)
11041
3.47M
        PyMem_Free((void *)buf1);
11042
82.0M
    if (release2)
11043
0
        PyMem_Free((void *)buf2);
11044
82.0M
    return unicode_result_unchanged(self);
11045
11046
0
  error:
11047
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
11048
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11049
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11050
0
    if (srelease)
11051
0
        PyMem_Free((void *)sbuf);
11052
0
    if (release1)
11053
0
        PyMem_Free((void *)buf1);
11054
0
    if (release2)
11055
0
        PyMem_Free((void *)buf2);
11056
0
    return NULL;
11057
6.79M
}
11058
11059
/* --- Unicode Object Methods --------------------------------------------- */
11060
11061
/*[clinic input]
11062
str.title as unicode_title
11063
11064
Return a version of the string where each word is titlecased.
11065
11066
More specifically, words start with uppercased characters and all remaining
11067
cased characters have lower case.
11068
[clinic start generated code]*/
11069
11070
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
{
    /* Title-casing is delegated entirely to case_operation() using the
       do_title callback. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
11076
11077
/*[clinic input]
11078
str.capitalize as unicode_capitalize
11079
11080
Return a capitalized version of the string.
11081
11082
More specifically, make the first character have upper case and the rest lower
11083
case.
11084
[clinic start generated code]*/
11085
11086
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
{
    /* An empty string is returned through unicode_result_unchanged();
       everything else goes through case_operation() with do_capitalize. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);

    return (length != 0) ? case_operation(self, do_capitalize)
                         : unicode_result_unchanged(self);
}
11094
11095
/*[clinic input]
11096
str.casefold as unicode_casefold
11097
11098
Return a version of the string suitable for caseless comparisons.
11099
[clinic start generated code]*/
11100
11101
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* ASCII strings are handed to ascii_upper_or_lower(self, 1); all other
       strings go through case_operation() with do_casefold. */
    return PyUnicode_IS_ASCII(self) ? ascii_upper_or_lower(self, 1)
                                    : case_operation(self, do_casefold);
}
11109
11110
11111
/* Argument converter. Accepts a single Unicode character. */
11112
11113
static int
11114
convert_uc(PyObject *obj, void *addr)
11115
0
{
11116
0
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
11117
11118
0
    if (!PyUnicode_Check(obj)) {
11119
0
        PyErr_Format(PyExc_TypeError,
11120
0
                     "The fill character must be a unicode character, "
11121
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
11122
0
        return 0;
11123
0
    }
11124
0
    if (PyUnicode_GET_LENGTH(obj) != 1) {
11125
0
        PyErr_SetString(PyExc_TypeError,
11126
0
                        "The fill character must be exactly one character long");
11127
0
        return 0;
11128
0
    }
11129
0
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
11130
0
    return 1;
11131
0
}
11132
11133
/*[clinic input]
11134
str.center as unicode_center
11135
11136
    width: Py_ssize_t
11137
    fillchar: Py_UCS4 = ' '
11138
    /
11139
11140
Return a centered string of length width.
11141
11142
Padding is done using the specified fill character (default is a space).
11143
[clinic start generated code]*/
11144
11145
static PyObject *
11146
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11147
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
11148
0
{
11149
0
    Py_ssize_t marg, left;
11150
11151
0
    if (PyUnicode_GET_LENGTH(self) >= width)
11152
0
        return unicode_result_unchanged(self);
11153
11154
0
    marg = width - PyUnicode_GET_LENGTH(self);
11155
0
    left = marg / 2 + (marg & width & 1);
11156
11157
0
    return pad(self, left, marg - left, fillchar);
11158
0
}
11159
11160
/* This function assumes that str1 and str2 are readied by the caller. */
11161
11162
static int
11163
unicode_compare(PyObject *str1, PyObject *str2)
11164
26.6M
{
11165
26.6M
#define COMPARE(TYPE1, TYPE2) \
11166
26.6M
    do { \
11167
25.1M
        TYPE1* p1 = (TYPE1 *)data1; \
11168
25.1M
        TYPE2* p2 = (TYPE2 *)data2; \
11169
25.1M
        TYPE1* end = p1 + len; \
11170
25.1M
        Py_UCS4 c1, c2; \
11171
25.1M
        for (; p1 != end; p1++, p2++) { \
11172
25.1M
            c1 = *p1; \
11173
25.1M
            c2 = *p2; \
11174
25.1M
            if (c1 != c2) \
11175
25.1M
                return (c1 < c2) ? -1 : 1; \
11176
25.1M
        } \
11177
25.1M
    } \
11178
25.1M
    while (0)
11179
11180
26.6M
    int kind1, kind2;
11181
26.6M
    const void *data1, *data2;
11182
26.6M
    Py_ssize_t len1, len2, len;
11183
11184
26.6M
    kind1 = PyUnicode_KIND(str1);
11185
26.6M
    kind2 = PyUnicode_KIND(str2);
11186
26.6M
    data1 = PyUnicode_DATA(str1);
11187
26.6M
    data2 = PyUnicode_DATA(str2);
11188
26.6M
    len1 = PyUnicode_GET_LENGTH(str1);
11189
26.6M
    len2 = PyUnicode_GET_LENGTH(str2);
11190
26.6M
    len = Py_MIN(len1, len2);
11191
11192
26.6M
    switch(kind1) {
11193
1.64M
    case PyUnicode_1BYTE_KIND:
11194
1.64M
    {
11195
1.64M
        switch(kind2) {
11196
67.1k
        case PyUnicode_1BYTE_KIND:
11197
67.1k
        {
11198
67.1k
            int cmp = memcmp(data1, data2, len);
11199
            /* normalize result of memcmp() into the range [-1; 1] */
11200
67.1k
            if (cmp < 0)
11201
42.7k
                return -1;
11202
24.4k
            if (cmp > 0)
11203
23.9k
                return 1;
11204
508
            break;
11205
24.4k
        }
11206
1.27M
        case PyUnicode_2BYTE_KIND:
11207
1.27M
            COMPARE(Py_UCS1, Py_UCS2);
11208
0
            break;
11209
304k
        case PyUnicode_4BYTE_KIND:
11210
304k
            COMPARE(Py_UCS1, Py_UCS4);
11211
0
            break;
11212
0
        default:
11213
0
            Py_UNREACHABLE();
11214
1.64M
        }
11215
508
        break;
11216
1.64M
    }
11217
22.5M
    case PyUnicode_2BYTE_KIND:
11218
22.5M
    {
11219
22.5M
        switch(kind2) {
11220
3.46k
        case PyUnicode_1BYTE_KIND:
11221
3.46k
            COMPARE(Py_UCS2, Py_UCS1);
11222
0
            break;
11223
20.3M
        case PyUnicode_2BYTE_KIND:
11224
20.3M
        {
11225
20.3M
            COMPARE(Py_UCS2, Py_UCS2);
11226
0
            break;
11227
20.3M
        }
11228
2.20M
        case PyUnicode_4BYTE_KIND:
11229
2.20M
            COMPARE(Py_UCS2, Py_UCS4);
11230
0
            break;
11231
0
        default:
11232
0
            Py_UNREACHABLE();
11233
22.5M
        }
11234
0
        break;
11235
22.5M
    }
11236
2.47M
    case PyUnicode_4BYTE_KIND:
11237
2.47M
    {
11238
2.47M
        switch(kind2) {
11239
3.74k
        case PyUnicode_1BYTE_KIND:
11240
3.74k
            COMPARE(Py_UCS4, Py_UCS1);
11241
0
            break;
11242
958k
        case PyUnicode_2BYTE_KIND:
11243
958k
            COMPARE(Py_UCS4, Py_UCS2);
11244
0
            break;
11245
1.51M
        case PyUnicode_4BYTE_KIND:
11246
1.51M
        {
11247
1.51M
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11248
1.51M
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11249
            /* normalize result of wmemcmp() into the range [-1; 1] */
11250
1.51M
            if (cmp < 0)
11251
752k
                return -1;
11252
761k
            if (cmp > 0)
11253
761k
                return 1;
11254
#else
11255
            COMPARE(Py_UCS4, Py_UCS4);
11256
#endif
11257
0
            break;
11258
761k
        }
11259
0
        default:
11260
0
            Py_UNREACHABLE();
11261
2.47M
        }
11262
0
        break;
11263
2.47M
    }
11264
0
    default:
11265
0
        Py_UNREACHABLE();
11266
26.6M
    }
11267
11268
508
    if (len1 == len2)
11269
506
        return 0;
11270
2
    if (len1 < len2)
11271
2
        return -1;
11272
0
    else
11273
0
        return 1;
11274
11275
2
#undef COMPARE
11276
2
}
11277
11278
11279
int
11280
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11281
300M
{
11282
300M
    assert(PyUnicode_Check(str1));
11283
300M
    assert(PyUnicode_Check(str2));
11284
300M
    if (str1 == str2) {
11285
84.2M
        return 1;
11286
84.2M
    }
11287
216M
    return unicode_eq(str1, str2);
11288
300M
}
11289
11290
11291
int
11292
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11293
0
{
11294
0
    if (!PyUnicode_Check(str1)) {
11295
0
        PyErr_Format(PyExc_TypeError,
11296
0
                     "first argument must be str, not %T", str1);
11297
0
        return -1;
11298
0
    }
11299
0
    if (!PyUnicode_Check(str2)) {
11300
0
        PyErr_Format(PyExc_TypeError,
11301
0
                     "second argument must be str, not %T", str2);
11302
0
        return -1;
11303
0
    }
11304
11305
0
    return _PyUnicode_Equal(str1, str2);
11306
0
}
11307
11308
11309
int
11310
PyUnicode_Compare(PyObject *left, PyObject *right)
11311
6.22k
{
11312
6.22k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11313
        /* a string is equal to itself */
11314
6.22k
        if (left == right)
11315
0
            return 0;
11316
11317
6.22k
        return unicode_compare(left, right);
11318
6.22k
    }
11319
0
    PyErr_Format(PyExc_TypeError,
11320
0
                 "Can't compare %.100s and %.100s",
11321
0
                 Py_TYPE(left)->tp_name,
11322
0
                 Py_TYPE(right)->tp_name);
11323
0
    return -1;
11324
6.22k
}
11325
11326
int
11327
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11328
2.02M
{
11329
2.02M
    Py_ssize_t i;
11330
2.02M
    int kind;
11331
2.02M
    Py_UCS4 chr;
11332
11333
2.02M
    assert(_PyUnicode_CHECK(uni));
11334
2.02M
    kind = PyUnicode_KIND(uni);
11335
2.02M
    if (kind == PyUnicode_1BYTE_KIND) {
11336
2.02M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11337
2.02M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11338
2.02M
        size_t len, len2 = strlen(str);
11339
2.02M
        int cmp;
11340
11341
2.02M
        len = Py_MIN(len1, len2);
11342
2.02M
        cmp = memcmp(data, str, len);
11343
2.02M
        if (cmp != 0) {
11344
1.43M
            if (cmp < 0)
11345
7.03k
                return -1;
11346
1.42M
            else
11347
1.42M
                return 1;
11348
1.43M
        }
11349
592k
        if (len1 > len2)
11350
70
            return 1; /* uni is longer */
11351
592k
        if (len1 < len2)
11352
804
            return -1; /* str is longer */
11353
591k
        return 0;
11354
592k
    }
11355
1.34k
    else {
11356
1.34k
        const void *data = PyUnicode_DATA(uni);
11357
        /* Compare Unicode string and source character set string */
11358
2.67k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11359
2.43k
            if (chr != (unsigned char)str[i])
11360
1.09k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11361
        /* This check keeps Python strings that end in '\0' from comparing equal
11362
         to C strings identical up to that point. */
11363
244
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11364
244
            return 1; /* uni is longer */
11365
0
        if (str[i])
11366
0
            return -1; /* str is longer */
11367
0
        return 0;
11368
0
    }
11369
2.02M
}
11370
11371
int
11372
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11373
0
{
11374
0
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11375
0
}
11376
11377
int
11378
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11379
0
{
11380
0
    assert(_PyUnicode_CHECK(unicode));
11381
0
    assert(str);
11382
11383
0
    if (PyUnicode_IS_ASCII(unicode)) {
11384
0
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11385
0
        return size == len &&
11386
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11387
0
    }
11388
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11389
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11390
0
        return size == len &&
11391
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11392
0
    }
11393
11394
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11395
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11396
0
        return 0;
11397
0
    }
11398
0
    const unsigned char *s = (const unsigned char *)str;
11399
0
    const unsigned char *ends = s + (size_t)size;
11400
0
    int kind = PyUnicode_KIND(unicode);
11401
0
    const void *data = PyUnicode_DATA(unicode);
11402
    /* Compare Unicode string and UTF-8 string */
11403
0
    for (Py_ssize_t i = 0; i < len; i++) {
11404
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11405
0
        if (ch < 0x80) {
11406
0
            if (ends == s || s[0] != ch) {
11407
0
                return 0;
11408
0
            }
11409
0
            s += 1;
11410
0
        }
11411
0
        else if (ch < 0x800) {
11412
0
            if ((ends - s) < 2 ||
11413
0
                s[0] != (0xc0 | (ch >> 6)) ||
11414
0
                s[1] != (0x80 | (ch & 0x3f)))
11415
0
            {
11416
0
                return 0;
11417
0
            }
11418
0
            s += 2;
11419
0
        }
11420
0
        else if (ch < 0x10000) {
11421
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11422
0
                (ends - s) < 3 ||
11423
0
                s[0] != (0xe0 | (ch >> 12)) ||
11424
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11425
0
                s[2] != (0x80 | (ch & 0x3f)))
11426
0
            {
11427
0
                return 0;
11428
0
            }
11429
0
            s += 3;
11430
0
        }
11431
0
        else {
11432
0
            assert(ch <= MAX_UNICODE);
11433
0
            if ((ends - s) < 4 ||
11434
0
                s[0] != (0xf0 | (ch >> 18)) ||
11435
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11436
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11437
0
                s[3] != (0x80 | (ch & 0x3f)))
11438
0
            {
11439
0
                return 0;
11440
0
            }
11441
0
            s += 4;
11442
0
        }
11443
0
    }
11444
0
    return s == ends;
11445
0
}
11446
11447
int
11448
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11449
6.90M
{
11450
6.90M
    size_t len;
11451
6.90M
    assert(_PyUnicode_CHECK(unicode));
11452
6.90M
    assert(str);
11453
#ifndef NDEBUG
11454
    for (const char *p = str; *p; p++) {
11455
        assert((unsigned char)*p < 128);
11456
    }
11457
#endif
11458
6.90M
    if (!PyUnicode_IS_ASCII(unicode))
11459
151k
        return 0;
11460
6.75M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11461
6.75M
    return strlen(str) == len &&
11462
6.75M
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11463
6.90M
}
11464
11465
int
11466
_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11467
0
{
11468
0
    PyObject *right_uni;
11469
11470
0
    assert(_PyUnicode_CHECK(left));
11471
0
    assert(right->string);
11472
#ifndef NDEBUG
11473
    for (const char *p = right->string; *p; p++) {
11474
        assert((unsigned char)*p < 128);
11475
    }
11476
#endif
11477
11478
0
    if (!PyUnicode_IS_ASCII(left))
11479
0
        return 0;
11480
11481
0
    right_uni = _PyUnicode_FromId(right);       /* borrowed */
11482
0
    if (right_uni == NULL) {
11483
        /* memory error or bad data */
11484
0
        PyErr_Clear();
11485
0
        return _PyUnicode_EqualToASCIIString(left, right->string);
11486
0
    }
11487
11488
0
    if (left == right_uni)
11489
0
        return 1;
11490
11491
0
    assert(PyUnicode_CHECK_INTERNED(right_uni));
11492
0
    if (PyUnicode_CHECK_INTERNED(left)) {
11493
0
        return 0;
11494
0
    }
11495
11496
0
    Py_hash_t right_hash = PyUnicode_HASH(right_uni);
11497
0
    assert(right_hash != -1);
11498
0
    Py_hash_t hash = PyUnicode_HASH(left);
11499
0
    if (hash != -1 && hash != right_hash) {
11500
0
        return 0;
11501
0
    }
11502
11503
0
    return unicode_eq(left, right_uni);
11504
0
}
11505
11506
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    /* Rich comparison for str.  Returns NotImplemented unless both operands
       are str objects, otherwise a bool for the requested operator. */
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    /* Identity: the answer follows from the operator alone. */
    if (left == right) {
        switch (op) {
        case Py_EQ:
        case Py_LE:
        case Py_GE:
            /* a string is equal to itself */
            Py_RETURN_TRUE;
        case Py_NE:
        case Py_LT:
        case Py_GT:
            Py_RETURN_FALSE;
        default:
            PyErr_BadArgument();
            return NULL;
        }
    }

    /* (In)equality only needs unicode_eq(); flip its result for "!=". */
    if (op == Py_EQ || op == Py_NE) {
        const int equal = unicode_eq(left, right);
        return PyBool_FromLong(equal ^ (op == Py_NE));
    }

    /* Ordering operators need the full three-way comparison. */
    const int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11540
11541
int
11542
PyUnicode_Contains(PyObject *str, PyObject *substr)
11543
101M
{
11544
101M
    int kind1, kind2;
11545
101M
    const void *buf1, *buf2;
11546
101M
    Py_ssize_t len1, len2;
11547
101M
    int result;
11548
11549
101M
    if (!PyUnicode_Check(substr)) {
11550
0
        PyErr_Format(PyExc_TypeError,
11551
0
                     "'in <string>' requires string as left operand, not %.100s",
11552
0
                     Py_TYPE(substr)->tp_name);
11553
0
        return -1;
11554
0
    }
11555
101M
    if (ensure_unicode(str) < 0)
11556
0
        return -1;
11557
11558
101M
    kind1 = PyUnicode_KIND(str);
11559
101M
    kind2 = PyUnicode_KIND(substr);
11560
101M
    if (kind1 < kind2)
11561
4.24M
        return 0;
11562
97.6M
    len1 = PyUnicode_GET_LENGTH(str);
11563
97.6M
    len2 = PyUnicode_GET_LENGTH(substr);
11564
97.6M
    if (len1 < len2)
11565
6.60M
        return 0;
11566
91.0M
    buf1 = PyUnicode_DATA(str);
11567
91.0M
    buf2 = PyUnicode_DATA(substr);
11568
91.0M
    if (len2 == 1) {
11569
90.9M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11570
90.9M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11571
90.9M
        return result;
11572
90.9M
    }
11573
33.1k
    if (kind2 != kind1) {
11574
17.8k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11575
17.8k
        if (!buf2)
11576
0
            return -1;
11577
17.8k
    }
11578
11579
33.1k
    switch (kind1) {
11580
15.3k
    case PyUnicode_1BYTE_KIND:
11581
15.3k
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11582
15.3k
        break;
11583
13.3k
    case PyUnicode_2BYTE_KIND:
11584
13.3k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11585
13.3k
        break;
11586
4.45k
    case PyUnicode_4BYTE_KIND:
11587
4.45k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11588
4.45k
        break;
11589
0
    default:
11590
0
        Py_UNREACHABLE();
11591
33.1k
    }
11592
11593
33.1k
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11594
33.1k
    if (kind2 != kind1)
11595
17.8k
        PyMem_Free((void *)buf2);
11596
11597
33.1k
    return result;
11598
33.1k
}
11599
11600
/* Concat to string or Unicode object giving a new Unicode object. */
11601
11602
PyObject *
PyUnicode_Concat(PyObject *left, PyObject *right)
{
    /* Returns left + right, or NULL with an exception set. */
    if (ensure_unicode(left) < 0) {
        return NULL;
    }

    if (!PyUnicode_Check(right)) {
        if (!_PyTemplate_CheckExact(right)) {
            PyErr_Format(PyExc_TypeError,
                "can only concatenate str (not \"%.200s\") to str",
                Py_TYPE(right)->tp_name);
            return NULL;
        }
        // str + tstring is implemented in the tstring type
        return _PyTemplate_Concat(left, right);
    }

    /* Shortcuts: when one side is the empty-string object, the result is
       just the other operand passed through PyUnicode_FromObject(). */
    PyObject *empty = unicode_get_empty();  // Borrowed reference
    if (left == empty) {
        return PyUnicode_FromObject(right);
    }
    if (right == empty) {
        return PyUnicode_FromObject(left);
    }

    /* Refuse lengths whose sum would overflow Py_ssize_t. */
    const Py_ssize_t left_len = PyUnicode_GET_LENGTH(left);
    const Py_ssize_t right_len = PyUnicode_GET_LENGTH(right);
    if (left_len > PY_SSIZE_T_MAX - right_len) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        return NULL;
    }
    const Py_ssize_t total = left_len + right_len;

    /* The result must be able to hold the widest character of either side. */
    const Py_UCS4 left_max = PyUnicode_MAX_CHAR_VALUE(left);
    const Py_UCS4 right_max = PyUnicode_MAX_CHAR_VALUE(right);

    /* Concat the two Unicode strings */
    PyObject *joined = PyUnicode_New(total, Py_MAX(left_max, right_max));
    if (joined == NULL) {
        return NULL;
    }
    _PyUnicode_FastCopyCharacters(joined, 0, left, 0, left_len);
    _PyUnicode_FastCopyCharacters(joined, left_len, right, 0, right_len);
    assert(_PyUnicode_CheckConsistency(joined, 1));
    return joined;
}
11656
11657
void
11658
PyUnicode_Append(PyObject **p_left, PyObject *right)
11659
1.06M
{
11660
1.06M
    PyObject *left, *res;
11661
1.06M
    Py_UCS4 maxchar, maxchar2;
11662
1.06M
    Py_ssize_t left_len, right_len, new_len;
11663
11664
1.06M
    if (p_left == NULL) {
11665
0
        if (!PyErr_Occurred())
11666
0
            PyErr_BadInternalCall();
11667
0
        return;
11668
0
    }
11669
1.06M
    left = *p_left;
11670
1.06M
    if (right == NULL || left == NULL
11671
1.06M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11672
0
        if (!PyErr_Occurred())
11673
0
            PyErr_BadInternalCall();
11674
0
        goto error;
11675
0
    }
11676
11677
    /* Shortcuts */
11678
1.06M
    PyObject *empty = unicode_get_empty();  // Borrowed reference
11679
1.06M
    if (left == empty) {
11680
378k
        Py_DECREF(left);
11681
378k
        *p_left = Py_NewRef(right);
11682
378k
        return;
11683
378k
    }
11684
681k
    if (right == empty) {
11685
0
        return;
11686
0
    }
11687
11688
681k
    left_len = PyUnicode_GET_LENGTH(left);
11689
681k
    right_len = PyUnicode_GET_LENGTH(right);
11690
681k
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11691
0
        PyErr_SetString(PyExc_OverflowError,
11692
0
                        "strings are too large to concat");
11693
0
        goto error;
11694
0
    }
11695
681k
    new_len = left_len + right_len;
11696
11697
681k
    if (unicode_modifiable(left)
11698
681k
        && PyUnicode_CheckExact(right)
11699
681k
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11700
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11701
           to change the structure size, but characters are stored just after
11702
           the structure, and so it requires to move all characters which is
11703
           not so different than duplicating the string. */
11704
681k
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11705
631k
    {
11706
        /* append inplace */
11707
631k
        if (unicode_resize(p_left, new_len) != 0)
11708
0
            goto error;
11709
11710
        /* copy 'right' into the newly allocated area of 'left' */
11711
631k
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11712
631k
    }
11713
50.7k
    else {
11714
50.7k
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11715
50.7k
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11716
50.7k
        maxchar = Py_MAX(maxchar, maxchar2);
11717
11718
        /* Concat the two Unicode strings */
11719
50.7k
        res = PyUnicode_New(new_len, maxchar);
11720
50.7k
        if (res == NULL)
11721
0
            goto error;
11722
50.7k
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11723
50.7k
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11724
50.7k
        Py_DECREF(left);
11725
50.7k
        *p_left = res;
11726
50.7k
    }
11727
681k
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11728
681k
    return;
11729
11730
0
error:
11731
0
    Py_CLEAR(*p_left);
11732
0
}
11733
11734
void
11735
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11736
0
{
11737
0
    PyUnicode_Append(pleft, right);
11738
0
    Py_XDECREF(right);
11739
0
}
11740
11741
/*[clinic input]
11742
@text_signature "($self, sub[, start[, end]], /)"
11743
str.count as unicode_count -> Py_ssize_t
11744
11745
    self as str: self
11746
    sub as substr: unicode
11747
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11748
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11749
    /
11750
11751
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11752
11753
Optional arguments start and end are interpreted as in slice notation.
11754
[clinic start generated code]*/
11755
11756
static Py_ssize_t
11757
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11758
                   Py_ssize_t end)
11759
/*[clinic end generated code: output=8fcc3aef0b18edbf input=6f168ffd94be8785]*/
11760
19.0M
{
11761
19.0M
    assert(PyUnicode_Check(str));
11762
19.0M
    assert(PyUnicode_Check(substr));
11763
11764
19.0M
    Py_ssize_t result;
11765
19.0M
    int kind1, kind2;
11766
19.0M
    const void *buf1 = NULL, *buf2 = NULL;
11767
19.0M
    Py_ssize_t len1, len2;
11768
11769
19.0M
    kind1 = PyUnicode_KIND(str);
11770
19.0M
    kind2 = PyUnicode_KIND(substr);
11771
19.0M
    if (kind1 < kind2)
11772
0
        return 0;
11773
11774
19.0M
    len1 = PyUnicode_GET_LENGTH(str);
11775
19.0M
    len2 = PyUnicode_GET_LENGTH(substr);
11776
19.0M
    ADJUST_INDICES(start, end, len1);
11777
19.0M
    if (end - start < len2)
11778
74.7k
        return 0;
11779
11780
19.0M
    buf1 = PyUnicode_DATA(str);
11781
19.0M
    buf2 = PyUnicode_DATA(substr);
11782
19.0M
    if (kind2 != kind1) {
11783
3.52M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11784
3.52M
        if (!buf2)
11785
0
            goto onError;
11786
3.52M
    }
11787
11788
    // We don't reuse `anylib_count` here because of the explicit casts.
11789
19.0M
    switch (kind1) {
11790
15.4M
    case PyUnicode_1BYTE_KIND:
11791
15.4M
        result = ucs1lib_count(
11792
15.4M
            ((const Py_UCS1*)buf1) + start, end - start,
11793
15.4M
            buf2, len2, PY_SSIZE_T_MAX
11794
15.4M
            );
11795
15.4M
        break;
11796
2.76M
    case PyUnicode_2BYTE_KIND:
11797
2.76M
        result = ucs2lib_count(
11798
2.76M
            ((const Py_UCS2*)buf1) + start, end - start,
11799
2.76M
            buf2, len2, PY_SSIZE_T_MAX
11800
2.76M
            );
11801
2.76M
        break;
11802
761k
    case PyUnicode_4BYTE_KIND:
11803
761k
        result = ucs4lib_count(
11804
761k
            ((const Py_UCS4*)buf1) + start, end - start,
11805
761k
            buf2, len2, PY_SSIZE_T_MAX
11806
761k
            );
11807
761k
        break;
11808
0
    default:
11809
0
        Py_UNREACHABLE();
11810
19.0M
    }
11811
11812
19.0M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11813
19.0M
    if (kind2 != kind1)
11814
3.52M
        PyMem_Free((void *)buf2);
11815
11816
19.0M
    return result;
11817
0
  onError:
11818
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11819
0
    if (kind2 != kind1)
11820
0
        PyMem_Free((void *)buf2);
11821
0
    return -1;
11822
19.0M
}
11823
11824
/*[clinic input]
11825
str.encode as unicode_encode
11826
11827
    encoding: str(c_default="NULL") = 'utf-8'
11828
        The encoding in which to encode the string.
11829
    errors: str(c_default="NULL") = 'strict'
11830
        The error handling scheme to use for encoding errors.
11831
        The default is 'strict' meaning that encoding errors raise a
11832
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11833
        'xmlcharrefreplace' as well as any other name registered with
11834
        codecs.register_error that can handle UnicodeEncodeErrors.
11835
11836
Encode the string using the codec registered for encoding.
11837
[clinic start generated code]*/
11838
11839
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* Thin wrapper: encoding and errors are forwarded unchanged (the clinic
       declaration above gives both a C default of NULL). */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11845
11846
/*[clinic input]
11847
str.expandtabs as unicode_expandtabs
11848
11849
    tabsize: int = 8
11850
11851
Return a copy where all tab characters are expanded using spaces.
11852
11853
If tabsize is not given, a tab size of 8 characters is assumed.
11854
[clinic start generated code]*/
11855
11856
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* Two passes over the source: the first measures the expanded length
       (and detects whether there is any tab at all), the second writes the
       output.  With tabsize <= 0, tabs contribute nothing to the output. */
    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    PyObject *expanded;
    void *dest_data;
    Py_ssize_t column;      /* position within the current line */
    Py_ssize_t out_len;     /* length of the expanded string */
    Py_ssize_t out_pos;     /* write index during the second pass */
    int saw_tab = 0;

    /* First pass: determine size of output string */
    column = 0;
    out_len = 0;
    for (Py_ssize_t pos = 0; pos < src_len; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src_data, pos);
        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            column++;
            out_len++;
            /* A line break restarts the column count. */
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        saw_tab = 1;
        if (tabsize > 0) {
            /* Distance to the next tab stop; cannot overflow. */
            const Py_ssize_t incr = tabsize - (column % tabsize);
            if (out_len > PY_SSIZE_T_MAX - incr) {
                goto overflow;
            }
            column += incr;
            out_len += incr;
        }
    }

    /* Without any tab there is nothing to expand. */
    if (!saw_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (expanded == NULL) {
        return NULL;
    }
    dest_data = PyUnicode_DATA(expanded);

    column = 0;
    out_pos = 0;
    for (Py_ssize_t pos = 0; pos < src_len; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src_data, pos);
        if (ch == '\t') {
            if (tabsize > 0) {
                const Py_ssize_t incr = tabsize - (column % tabsize);
                column += incr;
                unicode_fill(kind, dest_data, ' ', out_pos, incr);
                out_pos += incr;
            }
            continue;
        }

        column++;
        PyUnicode_WRITE(kind, dest_data, out_pos, ch);
        out_pos++;
        if (ch == '\n' || ch == '\r') {
            column = 0;
        }
    }
    assert (out_pos == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11931
11932
/*[clinic input]
11933
str.find as unicode_find = str.count
11934
11935
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11936
11937
Optional arguments start and end are interpreted as in slice notation.
11938
Return -1 on failure.
11939
[clinic start generated code]*/
11940
11941
static Py_ssize_t
11942
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11943
                  Py_ssize_t end)
11944
/*[clinic end generated code: output=51dbe6255712e278 input=4a89d2d68ef57256]*/
11945
15.5M
{
11946
15.5M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11947
15.5M
    if (result < 0) {
11948
237k
        return -1;
11949
237k
    }
11950
15.2M
    return result;
11951
15.5M
}
11952
11953
static PyObject *
11954
unicode_getitem(PyObject *self, Py_ssize_t index)
11955
58.5M
{
11956
58.5M
    const void *data;
11957
58.5M
    int kind;
11958
58.5M
    Py_UCS4 ch;
11959
11960
58.5M
    if (!PyUnicode_Check(self)) {
11961
0
        PyErr_BadArgument();
11962
0
        return NULL;
11963
0
    }
11964
58.5M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11965
466
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11966
466
        return NULL;
11967
466
    }
11968
58.5M
    kind = PyUnicode_KIND(self);
11969
58.5M
    data = PyUnicode_DATA(self);
11970
58.5M
    ch = PyUnicode_READ(kind, data, index);
11971
58.5M
    return unicode_char(ch);
11972
58.5M
}
11973
11974
/* Believe it or not, this produces the same value for ASCII strings
11975
   as bytes_hash(). */
11976
static Py_hash_t
11977
unicode_hash(PyObject *self)
11978
42.2M
{
11979
42.2M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11980
11981
#ifdef Py_DEBUG
11982
    assert(_Py_HashSecret_Initialized);
11983
#endif
11984
42.2M
    Py_hash_t hash = PyUnicode_HASH(self);
11985
42.2M
    if (hash != -1) {
11986
208k
        return hash;
11987
208k
    }
11988
42.0M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11989
42.0M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11990
11991
42.0M
    PyUnicode_SET_HASH(self, x);
11992
42.0M
    return x;
11993
42.2M
}
11994
11995
/*[clinic input]
11996
str.index as unicode_index = str.count
11997
11998
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11999
12000
Optional arguments start and end are interpreted as in slice notation.
12001
Raises ValueError when the substring is not found.
12002
[clinic start generated code]*/
12003
12004
static Py_ssize_t
12005
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12006
                   Py_ssize_t end)
12007
/*[clinic end generated code: output=77558288837cdf40 input=d986aeac0be14a1c]*/
12008
564k
{
12009
564k
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
12010
564k
    if (result == -1) {
12011
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12012
0
    }
12013
564k
    else if (result < 0) {
12014
0
        return -1;
12015
0
    }
12016
564k
    return result;
12017
564k
}
12018
12019
/*[clinic input]
12020
str.isascii as unicode_isascii
12021
12022
Return True if all characters in the string are ASCII, False otherwise.
12023
12024
ASCII characters have code points in the range U+0000-U+007F.
12025
Empty string is ASCII too.
12026
[clinic start generated code]*/
12027
12028
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* The answer comes straight from PyUnicode_IS_ASCII(); no scan of
       the characters is performed here. */
    const long ascii_only = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(ascii_only);
}
12034
12035
/*[clinic input]
12036
str.islower as unicode_islower
12037
12038
Return True if the string is a lowercase string, False otherwise.
12039
12040
A string is lowercase if all cased characters in the string are lowercase and
12041
there is at least one cased character in the string.
12042
[clinic start generated code]*/
12043
12044
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISLOWER(only));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Any uppercase or titlecase character disqualifies the string; at
       least one lowercase character is required for a True result. */
    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
12077
12078
/*[clinic input]
12079
str.isupper as unicode_isupper
12080
12081
Return True if the string is an uppercase string, False otherwise.
12082
12083
A string is uppercase if all cased characters in the string are uppercase and
12084
there is at least one cased character in the string.
12085
[clinic start generated code]*/
12086
12087
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        const int is_upper = (Py_UNICODE_ISUPPER(only) != 0);
        return PyBool_FromLong(is_upper);
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Any lowercase or titlecase character disqualifies the string; at
       least one uppercase character is required for a True result. */
    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
12120
12121
/*[clinic input]
12122
str.istitle as unicode_istitle
12123
12124
Return True if the string is a title-cased string, False otherwise.
12125
12126
In a title-cased string, upper- and title-case characters may only
12127
follow uncased characters and lowercase characters only cased ones.
12128
[clinic start generated code]*/
12129
12130
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        const int title_like = (Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0);
        return PyBool_FromLong(title_like);
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    int any_cased = 0;       /* at least one cased character was seen */
    int after_cased = 0;     /* the previous character was cased */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* Upper/title case may only follow an uncased character. */
            if (after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else if (Py_UNICODE_ISLOWER(ch)) {
            /* Lowercase may only follow a cased character. */
            if (!after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else {
            /* Uncased character: the next cased one starts a new word. */
            after_cased = 0;
            continue;
        }
        after_cased = 1;
        any_cased = 1;
    }
    return PyBool_FromLong(any_cased);
}
12176
12177
/*[clinic input]
12178
str.isspace as unicode_isspace
12179
12180
Return True if the string is a whitespace string, False otherwise.
12181
12182
A string is whitespace if all characters in the string are whitespace and there
12183
is at least one character in the string.
12184
[clinic start generated code]*/
12185
12186
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISSPACE(only));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* The first non-whitespace character settles the answer. */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12214
12215
/*[clinic input]
12216
str.isalpha as unicode_isalpha
12217
12218
Return True if the string is an alphabetic string, False otherwise.
12219
12220
A string is alphabetic if all characters in the string are alphabetic and there
12221
is at least one character in the string.
12222
[clinic start generated code]*/
12223
12224
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALPHA(only));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* The first non-alphabetic character settles the answer. */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALPHA(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12251
12252
/*[clinic input]
12253
str.isalnum as unicode_isalnum
12254
12255
Return True if the string is an alpha-numeric string, False otherwise.
12256
12257
A string is alpha-numeric if all characters in the string are alpha-numeric and
12258
there is at least one character in the string.
12259
[clinic start generated code]*/
12260
12261
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
{
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* The first non-alphanumeric character settles the answer. */
    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12290
12291
/*[clinic input]
12292
str.isdecimal as unicode_isdecimal
12293
12294
Return True if the string is a decimal string, False otherwise.
12295
12296
A string is a decimal string if all characters in the string are decimal and
12297
there is at least one character in the string.
12298
[clinic start generated code]*/
12299
12300
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDECIMAL(only));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* The first non-decimal character settles the answer. */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDECIMAL(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12327
12328
/*[clinic input]
12329
str.isdigit as unicode_isdigit
12330
12331
Return True if the string is a digit string, False otherwise.
12332
12333
A string is a digit string if all characters in the string are digits and there
12334
is at least one character in the string.
12335
[clinic start generated code]*/
12336
12337
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* The first non-digit character settles the answer. */
    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDIGIT(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12365
12366
/*[clinic input]
12367
str.isnumeric as unicode_isnumeric
12368
12369
Return True if the string is a numeric string, False otherwise.
12370
12371
A string is numeric if all characters in the string are numeric and there is at
12372
least one character in the string.
12373
[clinic start generated code]*/
12374
12375
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISNUMERIC(only));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* The first non-numeric character settles the answer. */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12402
12403
Py_ssize_t
12404
_PyUnicode_ScanIdentifier(PyObject *self)
12405
13.7k
{
12406
13.7k
    Py_ssize_t i;
12407
13.7k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12408
13.7k
    if (len == 0) {
12409
        /* an empty string is not a valid identifier */
12410
0
        return 0;
12411
0
    }
12412
12413
13.7k
    int kind = PyUnicode_KIND(self);
12414
13.7k
    const void *data = PyUnicode_DATA(self);
12415
13.7k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12416
    /* PEP 3131 says that the first character must be in
12417
       XID_Start and subsequent characters in XID_Continue,
12418
       and for the ASCII range, the 2.x rules apply (i.e
12419
       start with letters and underscore, continue with
12420
       letters, digits, underscore). However, given the current
12421
       definition of XID_Start and XID_Continue, it is sufficient
12422
       to check just for these, except that _ must be allowed
12423
       as starting an identifier.  */
12424
13.7k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12425
405
        return 0;
12426
405
    }
12427
12428
69.4k
    for (i = 1; i < len; i++) {
12429
56.3k
        ch = PyUnicode_READ(kind, data, i);
12430
56.3k
        if (!_PyUnicode_IsXidContinue(ch)) {
12431
213
            return i;
12432
213
        }
12433
56.3k
    }
12434
13.0k
    return i;
12435
13.3k
}
12436
12437
int
12438
PyUnicode_IsIdentifier(PyObject *self)
12439
590
{
12440
590
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12441
590
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12442
    /* an empty string is not a valid identifier */
12443
590
    return len && i == len;
12444
590
}
12445
12446
/*[clinic input]
12447
str.isidentifier as unicode_isidentifier
12448
12449
Return True if the string is a valid Python identifier, False otherwise.
12450
12451
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12452
such as "def" or "class".
12453
[clinic start generated code]*/
12454
12455
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
{
    /* Wrap the C-level 0/1 answer in a Python bool. */
    const int is_identifier = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_identifier);
}
12461
12462
/*[clinic input]
12463
str.isprintable as unicode_isprintable
12464
12465
Return True if all characters in the string are printable, False otherwise.
12466
12467
A character is printable if repr() may use it in its output.
12468
[clinic start generated code]*/
12469
12470
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=4e56bcc6b06ca18c]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISPRINTABLE(only));
    }

    /* There is no empty-string special case: with nothing to scan the
       loop is skipped and the result is True. */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISPRINTABLE(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12494
12495
/*[clinic input]
12496
str.join as unicode_join
12497
12498
    iterable: object
12499
    /
12500
12501
Concatenate any number of strings.
12502
12503
The string whose method is called is inserted in between each given string.
12504
The result is returned as a new string.
12505
12506
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12507
[clinic start generated code]*/
12508
12509
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
{
    /* str.join() is a direct forward to PyUnicode_Join(), with `self`
       as the first argument and the iterable as the second. */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12515
12516
static Py_ssize_t
12517
unicode_length(PyObject *self)
12518
39.9M
{
12519
39.9M
    return PyUnicode_GET_LENGTH(self);
12520
39.9M
}
12521
12522
/*[clinic input]
12523
str.ljust as unicode_ljust
12524
12525
    width: Py_ssize_t
12526
    fillchar: Py_UCS4 = ' '
12527
    /
12528
12529
Return a left-justified string of length width.
12530
12531
Padding is done using the specified fill character (default is a space).
12532
[clinic start generated code]*/
12533
12534
static PyObject *
12535
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12536
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12537
0
{
12538
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12539
0
        return unicode_result_unchanged(self);
12540
12541
0
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12542
0
}
12543
12544
/*[clinic input]
12545
str.lower as unicode_lower
12546
12547
Return a copy of the string converted to lowercase.
12548
[clinic start generated code]*/
12549
12550
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* Non-ASCII strings go through the generic case-mapping path. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }

    /* ASCII strings use the dedicated helper; the flag value 1
       presumably selects lowercasing -- confirm against
       ascii_upper_or_lower(). */
    return ascii_upper_or_lower(self, 1);
}
12558
12559
63.3M
/* Strip modes understood by the strip helpers in this file.  The values
   double as indices into stripfuncnames[] below, so the two must stay in
   sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above.  Both the strings and the pointer table are
   const: the table is never modified at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name for strip mode `i`, for use in error messages.  `i` must be
   one of LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP; no bounds check is done. */
#define STRIPNAME(i) (stripfuncnames[i])
12567
12568
/* externally visible for str.strip(unicode) */
12569
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    /* Strip from `self` every leading and/or trailing character (as
       selected by `striptype`) that occurs in `sepobj`, and return the
       remaining substring. */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);

    /* The bloom mask is tested before the exact PyUnicode_FindChar()
       lookup, so characters it rejects never reach the search. */
    const BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                               PyUnicode_DATA(sepobj),
                                               seplen);

    /* `left` ends up as the index of the first character to keep. */
    Py_ssize_t left = 0;
    if (striptype != RIGHTSTRIP) {
        for (; left < len; left++) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, left);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
        }
    }

    /* `right` ends up one past the last character to keep; it never
       moves below `left`. */
    Py_ssize_t right = len;
    if (striptype != LEFTSTRIP) {
        for (; right > left; right--) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, right - 1);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
        }
    }

    return PyUnicode_Substring(self, left, right);
}
12615
12616
PyObject*
12617
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12618
301M
{
12619
301M
    const unsigned char *data;
12620
301M
    int kind;
12621
301M
    Py_ssize_t length;
12622
12623
301M
    length = PyUnicode_GET_LENGTH(self);
12624
301M
    end = Py_MIN(end, length);
12625
12626
301M
    if (start == 0 && end == length)
12627
55.2M
        return unicode_result_unchanged(self);
12628
12629
245M
    if (start < 0 || end < 0) {
12630
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12631
0
        return NULL;
12632
0
    }
12633
245M
    if (start >= length || end < start)
12634
168k
        _Py_RETURN_UNICODE_EMPTY();
12635
12636
245M
    length = end - start;
12637
245M
    if (PyUnicode_IS_ASCII(self)) {
12638
52.4M
        data = PyUnicode_1BYTE_DATA(self);
12639
52.4M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12640
52.4M
    }
12641
193M
    else {
12642
193M
        kind = PyUnicode_KIND(self);
12643
193M
        data = PyUnicode_1BYTE_DATA(self);
12644
193M
        return PyUnicode_FromKindAndData(kind,
12645
193M
                                         data + kind * start,
12646
193M
                                         length);
12647
193M
    }
12648
245M
}
12649
12650
static PyObject *
do_strip(PyObject *self, int striptype)
{
    /* Whitespace-stripping worker for strip()/lstrip()/rstrip() when no
       explicit character set is given.  `left` is the first index kept,
       `right` is one past the last index kept. */
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int strip_leading = (striptype != RIGHTSTRIP);
    const int strip_trailing = (striptype != LEFTSTRIP);
    Py_ssize_t left = 0;
    Py_ssize_t right = len;

    if (PyUnicode_IS_ASCII(self)) {
        /* ASCII data: whitespace is decided by a direct lookup in the
           _Py_ascii_whitespace table. */
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);

        if (strip_leading) {
            for (; left < len; left++) {
                const Py_UCS1 ch = data[left];
                if (!_Py_ascii_whitespace[ch]) {
                    break;
                }
            }
        }
        if (strip_trailing) {
            for (; right > left; right--) {
                const Py_UCS1 ch = data[right - 1];
                if (!_Py_ascii_whitespace[ch]) {
                    break;
                }
            }
        }
    }
    else {
        /* General case: read by kind and use the Unicode-aware test. */
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (strip_leading) {
            for (; left < len; left++) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, left);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
        if (strip_trailing) {
            for (; right > left; right--) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, right - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
    }

    return PyUnicode_Substring(self, left, right);
}
12711
12712
12713
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    /* Dispatch for the strip family: None means strip whitespace, a str
       means strip those characters, anything else is a TypeError. */
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12729
12730
12731
/*[clinic input]
12732
str.strip as unicode_strip
12733
12734
    chars: object = None
12735
    /
12736
12737
Return a copy of the string with leading and trailing whitespace removed.
12738
12739
If chars is given and not None, remove characters in chars instead.
12740
[clinic start generated code]*/
12741
12742
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
{
    /* str.strip(): run the shared strip dispatcher in BOTHSTRIP mode. */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12748
12749
12750
/*[clinic input]
12751
str.lstrip as unicode_lstrip
12752
12753
    chars: object = None
12754
    /
12755
12756
Return a copy of the string with leading whitespace removed.
12757
12758
If chars is given and not None, remove characters in chars instead.
12759
[clinic start generated code]*/
12760
12761
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* str.lstrip(): run the shared strip dispatcher in LEFTSTRIP mode. */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12767
12768
12769
/*[clinic input]
12770
str.rstrip as unicode_rstrip
12771
12772
    chars: object = None
12773
    /
12774
12775
Return a copy of the string with trailing whitespace removed.
12776
12777
If chars is given and not None, remove characters in chars instead.
12778
[clinic start generated code]*/
12779
12780
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* str.rstrip(): run the shared strip dispatcher in RIGHTSTRIP mode. */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12786
12787
12788
static PyObject*
12789
unicode_repeat(PyObject *str, Py_ssize_t len)
12790
296k
{
12791
296k
    PyObject *u;
12792
296k
    Py_ssize_t nchars, n;
12793
12794
296k
    if (len < 1)
12795
37.4k
        _Py_RETURN_UNICODE_EMPTY();
12796
12797
    /* no repeat, return original string */
12798
258k
    if (len == 1)
12799
87.1k
        return unicode_result_unchanged(str);
12800
12801
171k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12802
0
        PyErr_SetString(PyExc_OverflowError,
12803
0
                        "repeated string is too long");
12804
0
        return NULL;
12805
0
    }
12806
171k
    nchars = len * PyUnicode_GET_LENGTH(str);
12807
12808
171k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12809
171k
    if (!u)
12810
0
        return NULL;
12811
171k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12812
12813
171k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12814
168k
        int kind = PyUnicode_KIND(str);
12815
168k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12816
168k
        if (kind == PyUnicode_1BYTE_KIND) {
12817
168k
            void *to = PyUnicode_DATA(u);
12818
168k
            memset(to, (unsigned char)fill_char, len);
12819
168k
        }
12820
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12821
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12822
0
            for (n = 0; n < len; ++n)
12823
0
                ucs2[n] = fill_char;
12824
0
        } else {
12825
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12826
0
            assert(kind == PyUnicode_4BYTE_KIND);
12827
0
            for (n = 0; n < len; ++n)
12828
0
                ucs4[n] = fill_char;
12829
0
        }
12830
168k
    }
12831
3.20k
    else {
12832
3.20k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12833
3.20k
        char *to = (char *) PyUnicode_DATA(u);
12834
3.20k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12835
3.20k
            PyUnicode_GET_LENGTH(str) * char_size);
12836
3.20k
    }
12837
12838
171k
    assert(_PyUnicode_CheckConsistency(u, 1));
12839
171k
    return u;
12840
171k
}
12841
12842
PyObject *
12843
PyUnicode_Replace(PyObject *str,
12844
                  PyObject *substr,
12845
                  PyObject *replstr,
12846
                  Py_ssize_t maxcount)
12847
2
{
12848
2
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12849
2
            ensure_unicode(replstr) < 0)
12850
0
        return NULL;
12851
2
    return replace(str, substr, replstr, maxcount);
12852
2
}
12853
12854
/*[clinic input]
12855
str.replace as unicode_replace
12856
12857
    old: unicode
12858
    new: unicode
12859
    /
12860
    count: Py_ssize_t = -1
12861
        Maximum number of occurrences to replace.
12862
        -1 (the default value) means replace all occurrences.
12863
12864
Return a copy with all occurrences of substring old replaced by new.
12865
12866
If the optional argument count is given, only the first count occurrences are
12867
replaced.
12868
[clinic start generated code]*/
12869
12870
static PyObject *
12871
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12872
                     Py_ssize_t count)
12873
/*[clinic end generated code: output=b63f1a8b5eebf448 input=3345c455d60a5499]*/
12874
88.8M
{
12875
88.8M
    return replace(self, old, new, count);
12876
88.8M
}
12877
12878
/*[clinic input]
12879
str.removeprefix as unicode_removeprefix
12880
12881
    prefix: unicode
12882
    /
12883
12884
Return a str with the given prefix string removed if present.
12885
12886
If the string starts with the prefix string, return string[len(prefix):].
12887
Otherwise, return a copy of the original string.
12888
[clinic start generated code]*/
12889
12890
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
{
    /* Direction -1 is the same one str.startswith() passes to tailmatch(). */
    int found = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
    if (found == -1) {
        return NULL;
    }
    if (!found) {
        /* No such prefix: hand back the string as-is. */
        return unicode_result_unchanged(self);
    }

    /* Keep everything that follows the prefix. */
    return PyUnicode_Substring(self,
                               PyUnicode_GET_LENGTH(prefix),
                               PyUnicode_GET_LENGTH(self));
}
12904
12905
/*[clinic input]
12906
str.removesuffix as unicode_removesuffix
12907
12908
    suffix: unicode
12909
    /
12910
12911
Return a str with the given suffix string removed if present.
12912
12913
If the string ends with the suffix string and that suffix is not empty,
12914
return string[:-len(suffix)]. Otherwise, return a copy of the original
12915
string.
12916
[clinic start generated code]*/
12917
12918
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* Direction +1 is the same one str.endswith() passes to tailmatch(). */
    int found = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
    if (found == -1) {
        return NULL;
    }
    if (!found) {
        /* No such suffix: hand back the string as-is. */
        return unicode_result_unchanged(self);
    }

    /* Keep everything that precedes the suffix. */
    return PyUnicode_Substring(self, 0,
                               PyUnicode_GET_LENGTH(self)
                               - PyUnicode_GET_LENGTH(suffix));
}
12932
12933
/* Build the repr() of a str object.

   A first pass over the input computes the size of the output, counts
   single and double quotes, and tracks the largest printable non-ASCII
   character (those are copied verbatim, so they decide the kind of the
   result).  The result is then allocated at its final size and filled,
   either by a plain copy when nothing needs escaping, or by the
   ucs1lib/ucs2lib/ucs4lib repr helper matching the output kind.

   Every addition to the output size is checked against PY_SSIZE_T_MAX,
   including the extra backslashes for escaped quotes and the two
   delimiters; OverflowError is raised instead of overflowing.

   Returns the new string, or NULL on overflow or when PyUnicode_New()
   fails. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;
    int ikind = PyUnicode_KIND(unicode);
    for (Py_ssize_t i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                maxch = (ch > maxch) ? ch : maxch;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    Py_UCS4 quote = '\'';
    int changed = (osize != isize);
    if (squote) {
        changed = 1;
        if (dquote) {
            /* Both squote and dquote present. Use squote,
               and escape them: one extra backslash per single quote. */
            if (osize > PY_SSIZE_T_MAX - squote) {
                PyErr_SetString(PyExc_OverflowError,
                                "string is too long to generate repr");
                return NULL;
            }
            osize += squote;
        }
        else {
            quote = '"';
        }
    }

    /* Room for the opening and closing quotes. */
    if (osize > PY_SSIZE_T_MAX - 2) {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too long to generate repr");
        return NULL;
    }
    osize += 2;

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL)
        return NULL;
    int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (!changed) {
        /* Nothing to escape: quote + verbatim copy + quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }
    else {
        /* Dispatch on the kind of the output buffer. */
        switch (okind) {
        case PyUnicode_1BYTE_KIND:
            ucs1lib_repr(unicode, quote, odata);
            break;
        case PyUnicode_2BYTE_KIND:
            ucs2lib_repr(unicode, quote, odata);
            break;
        default:
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
13023
13024
/*[clinic input]
13025
str.rfind as unicode_rfind = str.count
13026
13027
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
13028
13029
Optional arguments start and end are interpreted as in slice notation.
13030
Return -1 on failure.
13031
[clinic start generated code]*/
13032
13033
static Py_ssize_t
13034
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
13035
                   Py_ssize_t end)
13036
/*[clinic end generated code: output=880b29f01dd014c8 input=898361fb71f59294]*/
13037
10.1k
{
13038
10.1k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
13039
10.1k
    if (result < 0) {
13040
6.98k
        return -1;
13041
6.98k
    }
13042
3.14k
    return result;
13043
10.1k
}
13044
13045
/*[clinic input]
13046
str.rindex as unicode_rindex = str.count
13047
13048
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
13049
13050
Optional arguments start and end are interpreted as in slice notation.
13051
Raises ValueError when the substring is not found.
13052
[clinic start generated code]*/
13053
13054
static Py_ssize_t
13055
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
13056
                    Py_ssize_t end)
13057
/*[clinic end generated code: output=5f3aef124c867fe1 input=35943dead6c1ea9d]*/
13058
196k
{
13059
196k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
13060
196k
    if (result == -1) {
13061
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
13062
0
    }
13063
196k
    else if (result < 0) {
13064
0
        return -1;
13065
0
    }
13066
196k
    return result;
13067
196k
}
13068
13069
/*[clinic input]
13070
str.rjust as unicode_rjust
13071
13072
    width: Py_ssize_t
13073
    fillchar: Py_UCS4 = ' '
13074
    /
13075
13076
Return a right-justified string of length width.
13077
13078
Padding is done using the specified fill character (default is a space).
13079
[clinic start generated code]*/
13080
13081
static PyObject *
13082
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13083
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13084
0
{
13085
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13086
0
        return unicode_result_unchanged(self);
13087
13088
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13089
0
}
13090
13091
PyObject *
13092
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13093
0
{
13094
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13095
0
        return NULL;
13096
13097
0
    return split(s, sep, maxsplit);
13098
0
}
13099
13100
/*[clinic input]
13101
str.split as unicode_split
13102
13103
    sep: object = None
13104
        The separator used to split the string.
13105
13106
        When set to None (the default value), will split on any whitespace
13107
        character (including \n \r \t \f and spaces) and will discard
13108
        empty strings from the result.
13109
    maxsplit: Py_ssize_t = -1
13110
        Maximum number of splits.
13111
        -1 (the default value) means no limit.
13112
13113
Return a list of the substrings in the string, using sep as the separator string.
13114
13115
Splitting starts at the front of the string and works to the end.
13116
13117
Note, str.split() is mainly useful for data that has been intentionally
13118
delimited.  With natural text that includes punctuation, consider using
13119
the regular expression module.
13120
13121
[clinic start generated code]*/
13122
13123
static PyObject *
13124
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13125
/*[clinic end generated code: output=3a65b1db356948dc input=a29bcc0c7a5af0eb]*/
13126
24.2M
{
13127
24.2M
    if (sep == Py_None)
13128
165k
        return split(self, NULL, maxsplit);
13129
24.1M
    if (PyUnicode_Check(sep))
13130
24.1M
        return split(self, sep, maxsplit);
13131
13132
0
    PyErr_Format(PyExc_TypeError,
13133
0
                 "must be str or None, not %.100s",
13134
0
                 Py_TYPE(sep)->tp_name);
13135
0
    return NULL;
13136
24.1M
}
13137
13138
/* Partition str_obj around sep_obj, returning a 3-tuple.

   Both arguments must pass ensure_unicode().  When the separator has a
   wider kind or a greater length than the string, the "not found" tuple
   (str_obj, "", "") is returned directly.  Otherwise the separator data
   is converted to the string's kind when the kinds differ, and the
   stringlib partition routine for that kind does the search.

   Returns NULL when validation or the kind conversion fails. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0) {
        return NULL;
    }
    if (ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);
    if (sep_kind != str_kind) {
        /* Temporary copy of the separator in the string's kind;
           released below once the search is done. */
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_partition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_partition(str_obj, str_buf, str_len,
                                       sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    /* sep_buf aliases the separator's own data exactly when no
       conversion took place. */
    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (sep_kind != str_kind) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
13188
13189
13190
/* Partition str_obj around sep_obj using the rpartition routines,
   returning a 3-tuple.

   Both arguments must pass ensure_unicode().  When the separator has a
   wider kind or a greater length than the string, the "not found" tuple
   ("", "", str_obj) is returned directly.  Otherwise the separator data
   is converted to the string's kind when the kinds differ, and the
   stringlib rpartition routine for that kind does the search.

   Returns NULL when validation or the kind conversion fails. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0) {
        return NULL;
    }
    if (ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);
    if (sep_kind != str_kind) {
        /* Temporary copy of the separator in the string's kind;
           released below once the search is done. */
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    PyObject *result;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_rpartition(str_obj, str_buf, str_len,
                                         sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_rpartition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    /* sep_buf aliases the separator's own data exactly when no
       conversion took place. */
    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (sep_kind != str_kind) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
13240
13241
/*[clinic input]
13242
str.partition as unicode_partition
13243
13244
    sep: object
13245
    /
13246
13247
Partition the string into three parts using the given separator.
13248
13249
This will search for the separator in the string.  If the separator is found,
13250
returns a 3-tuple containing the part before the separator, the separator
13251
itself, and the part after it.
13252
13253
If the separator is not found, returns a 3-tuple containing the original string
13254
and two empty strings.
13255
[clinic start generated code]*/
13256
13257
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
{
    /* str.partition(): thin method wrapper over the public C API, which
       also takes care of validating sep. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
13263
13264
/*[clinic input]
13265
str.rpartition as unicode_rpartition = str.partition
13266
13267
Partition the string into three parts using the given separator.
13268
13269
This will search for the separator in the string, starting at the end. If
13270
the separator is found, returns a 3-tuple containing the part before the
13271
separator, the separator itself, and the part after it.
13272
13273
If the separator is not found, returns a 3-tuple containing two empty strings
13274
and the original string.
13275
[clinic start generated code]*/
13276
13277
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
{
    /* str.rpartition(): thin method wrapper over the public C API, which
       also takes care of validating sep. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13283
13284
PyObject *
13285
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13286
0
{
13287
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13288
0
        return NULL;
13289
13290
0
    return rsplit(s, sep, maxsplit);
13291
0
}
13292
13293
/*[clinic input]
13294
str.rsplit as unicode_rsplit = str.split
13295
13296
Return a list of the substrings in the string, using sep as the separator string.
13297
13298
Splitting starts at the end of the string and works to the front.
13299
[clinic start generated code]*/
13300
13301
static PyObject *
13302
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13303
/*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
13304
50
{
13305
50
    if (sep == Py_None)
13306
0
        return rsplit(self, NULL, maxsplit);
13307
50
    if (PyUnicode_Check(sep))
13308
50
        return rsplit(self, sep, maxsplit);
13309
13310
0
    PyErr_Format(PyExc_TypeError,
13311
0
                 "must be str or None, not %.100s",
13312
0
                 Py_TYPE(sep)->tp_name);
13313
0
    return NULL;
13314
50
}
13315
13316
/*[clinic input]
13317
str.splitlines as unicode_splitlines
13318
13319
    keepends: bool = False
13320
13321
Return a list of the lines in the string, breaking at line boundaries.
13322
13323
Line breaks are not included in the resulting list unless keepends is given and
13324
true.
13325
[clinic start generated code]*/
13326
13327
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=ba6ad05ee85d2b55]*/
{
    /* str.splitlines(): thin method wrapper over the public C API. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13333
13334
static
13335
PyObject *unicode_str(PyObject *self)
13336
3.14M
{
13337
3.14M
    return unicode_result_unchanged(self);
13338
3.14M
}
13339
13340
/*[clinic input]
13341
str.swapcase as unicode_swapcase
13342
13343
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13344
[clinic start generated code]*/
13345
13346
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
{
    /* str.swapcase(): run the generic case-mapping driver with the
       swapcase callback. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13352
13353
/*[clinic input]
13354
13355
@staticmethod
13356
str.maketrans as unicode_maketrans
13357
13358
  x: object
13359
13360
  y: unicode=NULL
13361
13362
  z: unicode=NULL
13363
13364
  /
13365
13366
Return a translation table usable for str.translate().
13367
13368
If there is only one argument, it must be a dictionary mapping Unicode
13369
ordinals (integers) or characters to Unicode ordinals, strings or None.
13370
Character keys will be then converted to ordinals.
13371
If there are two arguments, they must be strings of equal length, and
13372
in the resulting dictionary, each character in x will be mapped to the
13373
character at the same position in y. If there is a third argument, it
13374
must be a string, whose characters will be mapped to None in the result.
13375
[clinic start generated code]*/
13376
13377
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* The translation table under construction; released on any error. */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* One-argument form: x must be a dict */
        if (!PyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto fail;
        }

        /* Copy every entry, turning one-character string keys into their
           integer ordinals.  key and value are borrowed from x. */
        Py_ssize_t pos = 0;
        PyObject *key, *value;
        while (PyDict_Next(x, &pos, &key, &value)) {
            if (PyUnicode_Check(key)) {
                if (PyUnicode_GET_LENGTH(key) != 1) {
                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
                                    "table must be of length 1");
                    goto fail;
                }
                int key_kind = PyUnicode_KIND(key);
                const void *key_data = PyUnicode_DATA(key);
                PyObject *ordinal =
                    PyLong_FromLong(PyUnicode_READ(key_kind, key_data, 0));
                if (ordinal == NULL) {
                    goto fail;
                }
                int rc = PyDict_SetItem(table, ordinal, value);
                Py_DECREF(ordinal);
                if (rc < 0) {
                    goto fail;
                }
            }
            else if (PyLong_Check(key)) {
                /* Integer keys are stored unchanged. */
                if (PyDict_SetItem(table, key, value) < 0) {
                    goto fail;
                }
            }
            else {
                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
                                "be strings or integers");
                goto fail;
            }
        }
    }
    else {
        /* Two- or three-argument form: x must be a string too, with the
           same length as y. */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto fail;
        }
        const Py_ssize_t npairs = PyUnicode_GET_LENGTH(x);
        if (npairs != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto fail;
        }

        /* Map the ordinal of x[n] to the ordinal of y[n]. */
        const int x_kind = PyUnicode_KIND(x);
        const int y_kind = PyUnicode_KIND(y);
        const void *x_data = PyUnicode_DATA(x);
        const void *y_data = PyUnicode_DATA(y);
        for (Py_ssize_t n = 0; n < npairs; n++) {
            PyObject *from = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, n));
            if (from == NULL) {
                goto fail;
            }
            PyObject *to = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, n));
            if (to == NULL) {
                Py_DECREF(from);
                goto fail;
            }
            int rc = PyDict_SetItem(table, from, to);
            Py_DECREF(from);
            Py_DECREF(to);
            if (rc < 0) {
                goto fail;
            }
        }

        /* Map the ordinal of every character of z to None. */
        if (z != NULL) {
            const int z_kind = PyUnicode_KIND(z);
            const void *z_data = PyUnicode_DATA(z);
            const Py_ssize_t ndeleted = PyUnicode_GET_LENGTH(z);
            for (Py_ssize_t n = 0; n < ndeleted; n++) {
                PyObject *ordinal =
                    PyLong_FromLong(PyUnicode_READ(z_kind, z_data, n));
                if (ordinal == NULL) {
                    goto fail;
                }
                int rc = PyDict_SetItem(table, ordinal, Py_None);
                Py_DECREF(ordinal);
                if (rc < 0) {
                    goto fail;
                }
            }
        }
    }
    return table;

  fail:
    Py_DECREF(table);
    return NULL;
}
13482
13483
/*[clinic input]
13484
str.translate as unicode_translate
13485
13486
    table: object
13487
        Translation table, which must be a mapping of Unicode ordinals to
13488
        Unicode ordinals, strings, or None.
13489
    /
13490
13491
Replace each character in the string using the given translation table.
13492
13493
The table must implement lookup/indexing via __getitem__, for instance a
13494
dictionary or list.  If this operation raises LookupError, the character is
13495
left untouched.  Characters mapped to None are deleted.
13496
[clinic start generated code]*/
13497
13498
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
{
    /* str.translate(): charmap translation, always with the "ignore"
       error handler name. */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13504
13505
/*[clinic input]
13506
str.upper as unicode_upper
13507
13508
Return a copy of the string converted to uppercase.
13509
[clinic start generated code]*/
13510
13511
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    if (!PyUnicode_IS_ASCII(self)) {
        /* General path: the generic case-mapping driver. */
        return case_operation(self, do_upper);
    }
    /* ASCII-only strings take the dedicated helper; the 0 flag
       presumably selects upper-casing (verify against the helper). */
    return ascii_upper_or_lower(self, 0);
}
13519
13520
/*[clinic input]
13521
str.zfill as unicode_zfill
13522
13523
    width: Py_ssize_t
13524
    /
13525
13526
Pad a numeric string with zeros on the left, to fill a field of the given width.
13527
13528
The string is never truncated.
13529
[clinic start generated code]*/
13530
13531
static PyObject *
13532
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13533
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13534
0
{
13535
0
    Py_ssize_t fill;
13536
0
    PyObject *u;
13537
0
    int kind;
13538
0
    const void *data;
13539
0
    Py_UCS4 chr;
13540
13541
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13542
0
        return unicode_result_unchanged(self);
13543
13544
0
    fill = width - PyUnicode_GET_LENGTH(self);
13545
13546
0
    u = pad(self, fill, 0, '0');
13547
13548
0
    if (u == NULL)
13549
0
        return NULL;
13550
13551
0
    kind = PyUnicode_KIND(u);
13552
0
    data = PyUnicode_DATA(u);
13553
0
    chr = PyUnicode_READ(kind, data, fill);
13554
13555
0
    if (chr == '+' || chr == '-') {
13556
        /* move sign to beginning of string */
13557
0
        PyUnicode_WRITE(kind, data, 0, chr);
13558
0
        PyUnicode_WRITE(kind, data, fill, '0');
13559
0
    }
13560
13561
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13562
0
    return u;
13563
0
}
13564
13565
/*[clinic input]
13566
@text_signature "($self, prefix[, start[, end]], /)"
13567
str.startswith as unicode_startswith
13568
13569
    prefix as subobj: object
13570
        A string or a tuple of strings to try.
13571
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13572
        Optional start position. Default: start of the string.
13573
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13574
        Optional stop position. Default: end of the string.
13575
    /
13576
13577
Return True if the string starts with the specified prefix, False otherwise.
13578
[clinic start generated code]*/
13579
13580
static PyObject *
13581
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13582
                        Py_ssize_t end)
13583
/*[clinic end generated code: output=4bd7cfd0803051d4 input=5f918b5f5f89d856]*/
13584
71.9M
{
13585
71.9M
    if (PyTuple_Check(subobj)) {
13586
9.75M
        Py_ssize_t i;
13587
35.5M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13588
25.8M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13589
25.8M
            if (!PyUnicode_Check(substring)) {
13590
0
                PyErr_Format(PyExc_TypeError,
13591
0
                             "tuple for startswith must only contain str, "
13592
0
                             "not %.100s",
13593
0
                             Py_TYPE(substring)->tp_name);
13594
0
                return NULL;
13595
0
            }
13596
25.8M
            int result = tailmatch(self, substring, start, end, -1);
13597
25.8M
            if (result < 0) {
13598
0
                return NULL;
13599
0
            }
13600
25.8M
            if (result) {
13601
43.8k
                Py_RETURN_TRUE;
13602
43.8k
            }
13603
25.8M
        }
13604
        /* nothing matched */
13605
9.75M
        Py_RETURN_FALSE;
13606
9.75M
    }
13607
62.2M
    if (!PyUnicode_Check(subobj)) {
13608
0
        PyErr_Format(PyExc_TypeError,
13609
0
                     "startswith first arg must be str or "
13610
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13611
0
        return NULL;
13612
0
    }
13613
62.2M
    int result = tailmatch(self, subobj, start, end, -1);
13614
62.2M
    if (result < 0) {
13615
0
        return NULL;
13616
0
    }
13617
62.2M
    return PyBool_FromLong(result);
13618
62.2M
}
13619
13620
13621
/*[clinic input]
13622
@text_signature "($self, suffix[, start[, end]], /)"
13623
str.endswith as unicode_endswith
13624
13625
    suffix as subobj: object
13626
        A string or a tuple of strings to try.
13627
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13628
        Optional start position. Default: start of the string.
13629
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13630
        Optional stop position. Default: end of the string.
13631
    /
13632
13633
Return True if the string ends with the specified suffix, False otherwise.
13634
[clinic start generated code]*/
13635
13636
static PyObject *
13637
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13638
                      Py_ssize_t end)
13639
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=00fbdc774a7d4d71]*/
13640
15.6M
{
13641
15.6M
    if (PyTuple_Check(subobj)) {
13642
200k
        Py_ssize_t i;
13643
358k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13644
340k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13645
340k
            if (!PyUnicode_Check(substring)) {
13646
0
                PyErr_Format(PyExc_TypeError,
13647
0
                             "tuple for endswith must only contain str, "
13648
0
                             "not %.100s",
13649
0
                             Py_TYPE(substring)->tp_name);
13650
0
                return NULL;
13651
0
            }
13652
340k
            int result = tailmatch(self, substring, start, end, +1);
13653
340k
            if (result < 0) {
13654
0
                return NULL;
13655
0
            }
13656
340k
            if (result) {
13657
182k
                Py_RETURN_TRUE;
13658
182k
            }
13659
340k
        }
13660
200k
        Py_RETURN_FALSE;
13661
200k
    }
13662
15.4M
    if (!PyUnicode_Check(subobj)) {
13663
0
        PyErr_Format(PyExc_TypeError,
13664
0
                     "endswith first arg must be str or "
13665
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13666
0
        return NULL;
13667
0
    }
13668
15.4M
    int result = tailmatch(self, subobj, start, end, +1);
13669
15.4M
    if (result < 0) {
13670
0
        return NULL;
13671
0
    }
13672
15.4M
    return PyBool_FromLong(result);
13673
15.4M
}
13674
13675
13676
static inline void
13677
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13678
69.8M
{
13679
69.8M
    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13680
69.8M
    writer->data = PyUnicode_DATA(writer->buffer);
13681
13682
69.8M
    if (!writer->readonly) {
13683
69.8M
        writer->kind = PyUnicode_KIND(writer->buffer);
13684
69.8M
        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13685
69.8M
    }
13686
14.9k
    else {
13687
        /* use a value smaller than PyUnicode_1BYTE_KIND() so
13688
           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13689
14.9k
        writer->kind = 0;
13690
14.9k
        assert(writer->kind <= PyUnicode_1BYTE_KIND);
13691
13692
        /* Copy-on-write mode: set buffer size to 0 so
13693
         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13694
         * next write. */
13695
14.9k
        writer->size = 0;
13696
14.9k
    }
13697
69.8M
}
13698
13699
13700
void
13701
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13702
56.2M
{
13703
56.2M
    memset(writer, 0, sizeof(*writer));
13704
13705
    /* ASCII is the bare minimum */
13706
56.2M
    writer->min_char = 127;
13707
13708
    /* use a kind value smaller than PyUnicode_1BYTE_KIND so
13709
       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13710
56.2M
    assert(writer->kind == 0);
13711
56.2M
    assert(writer->kind < PyUnicode_1BYTE_KIND);
13712
56.2M
}
13713
13714
13715
PyUnicodeWriter*
13716
PyUnicodeWriter_Create(Py_ssize_t length)
13717
4.70M
{
13718
4.70M
    if (length < 0) {
13719
0
        PyErr_SetString(PyExc_ValueError,
13720
0
                        "length must be positive");
13721
0
        return NULL;
13722
0
    }
13723
13724
4.70M
    const size_t size = sizeof(_PyUnicodeWriter);
13725
4.70M
    PyUnicodeWriter *pub_writer;
13726
4.70M
    pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
13727
4.70M
    if (pub_writer == NULL) {
13728
2.61M
        pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
13729
2.61M
        if (pub_writer == NULL) {
13730
0
            return (PyUnicodeWriter *)PyErr_NoMemory();
13731
0
        }
13732
2.61M
    }
13733
4.70M
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
13734
13735
4.70M
    _PyUnicodeWriter_Init(writer);
13736
4.70M
    if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
13737
0
        PyUnicodeWriter_Discard(pub_writer);
13738
0
        return NULL;
13739
0
    }
13740
4.70M
    writer->overallocate = 1;
13741
13742
4.70M
    return pub_writer;
13743
4.70M
}
13744
13745
13746
/* Throw away a writer without producing a string.

   Passing NULL is allowed and does nothing.  Otherwise the buffer is
   released and the writer memory is handed back to the freelist (or
   freed with PyMem_Free). */
void PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
{
    if (writer != NULL) {
        _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
        _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    }
}
13754
13755
13756
// Initialize _PyUnicodeWriter with initial buffer
13757
static inline void
13758
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13759
504k
{
13760
504k
    memset(writer, 0, sizeof(*writer));
13761
504k
    writer->buffer = buffer;
13762
504k
    _PyUnicodeWriter_Update(writer);
13763
504k
    writer->min_length = writer->size;
13764
504k
}
13765
13766
13767
int
13768
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13769
                                 Py_ssize_t length, Py_UCS4 maxchar)
13770
69.3M
{
13771
69.3M
    Py_ssize_t newlen;
13772
69.3M
    PyObject *newbuffer;
13773
13774
69.3M
    assert(length >= 0);
13775
69.3M
    assert(maxchar <= MAX_UNICODE);
13776
13777
    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13778
69.3M
    assert((maxchar > writer->maxchar && length >= 0)
13779
69.3M
           || length > 0);
13780
13781
69.3M
    if (length > PY_SSIZE_T_MAX - writer->pos) {
13782
0
        PyErr_NoMemory();
13783
0
        return -1;
13784
0
    }
13785
69.3M
    newlen = writer->pos + length;
13786
13787
69.3M
    maxchar = Py_MAX(maxchar, writer->min_char);
13788
13789
69.3M
    if (writer->buffer == NULL) {
13790
51.1M
        assert(!writer->readonly);
13791
51.1M
        if (writer->overallocate
13792
51.1M
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13793
            /* overallocate to limit the number of realloc() */
13794
39.4M
            newlen += newlen / OVERALLOCATE_FACTOR;
13795
39.4M
        }
13796
51.1M
        if (newlen < writer->min_length)
13797
46.3M
            newlen = writer->min_length;
13798
13799
51.1M
        writer->buffer = PyUnicode_New(newlen, maxchar);
13800
51.1M
        if (writer->buffer == NULL)
13801
0
            return -1;
13802
51.1M
    }
13803
18.2M
    else if (newlen > writer->size) {
13804
15.4M
        if (writer->overallocate
13805
15.4M
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13806
            /* overallocate to limit the number of realloc() */
13807
15.1M
            newlen += newlen / OVERALLOCATE_FACTOR;
13808
15.1M
        }
13809
15.4M
        if (newlen < writer->min_length)
13810
1.07k
            newlen = writer->min_length;
13811
13812
15.4M
        if (maxchar > writer->maxchar || writer->readonly) {
13813
            /* resize + widen */
13814
3.71M
            maxchar = Py_MAX(maxchar, writer->maxchar);
13815
3.71M
            newbuffer = PyUnicode_New(newlen, maxchar);
13816
3.71M
            if (newbuffer == NULL)
13817
0
                return -1;
13818
3.71M
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13819
3.71M
                                          writer->buffer, 0, writer->pos);
13820
3.71M
            Py_DECREF(writer->buffer);
13821
3.71M
            writer->readonly = 0;
13822
3.71M
        }
13823
11.7M
        else {
13824
11.7M
            newbuffer = resize_compact(writer->buffer, newlen);
13825
11.7M
            if (newbuffer == NULL)
13826
0
                return -1;
13827
11.7M
        }
13828
15.4M
        writer->buffer = newbuffer;
13829
15.4M
    }
13830
2.74M
    else if (maxchar > writer->maxchar) {
13831
2.74M
        assert(!writer->readonly);
13832
2.74M
        newbuffer = PyUnicode_New(writer->size, maxchar);
13833
2.74M
        if (newbuffer == NULL)
13834
0
            return -1;
13835
2.74M
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13836
2.74M
                                      writer->buffer, 0, writer->pos);
13837
2.74M
        Py_SETREF(writer->buffer, newbuffer);
13838
2.74M
    }
13839
69.3M
    _PyUnicodeWriter_Update(writer);
13840
69.3M
    return 0;
13841
13842
69.3M
#undef OVERALLOCATE_FACTOR
13843
69.3M
}
13844
13845
int
13846
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13847
                                     int kind)
13848
140k
{
13849
140k
    Py_UCS4 maxchar;
13850
13851
    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13852
140k
    assert(writer->kind < kind);
13853
13854
140k
    switch (kind)
13855
140k
    {
13856
0
    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13857
140k
    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13858
0
    case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13859
0
    default:
13860
0
        Py_UNREACHABLE();
13861
140k
    }
13862
13863
140k
    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13864
140k
}
13865
13866
static inline int
13867
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13868
288M
{
13869
288M
    assert(ch <= MAX_UNICODE);
13870
288M
    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13871
0
        return -1;
13872
288M
    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13873
288M
    writer->pos++;
13874
288M
    return 0;
13875
288M
}
13876
13877
int
13878
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13879
104M
{
13880
104M
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13881
104M
}
13882
13883
int
13884
PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
13885
73.1M
{
13886
73.1M
    if (ch > MAX_UNICODE) {
13887
0
        PyErr_SetString(PyExc_ValueError,
13888
0
                        "character must be in range(0x110000)");
13889
0
        return -1;
13890
0
    }
13891
13892
73.1M
    return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
13893
73.1M
}
13894
13895
int
13896
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13897
70.1M
{
13898
70.1M
    assert(PyUnicode_Check(str));
13899
13900
70.1M
    Py_UCS4 maxchar;
13901
70.1M
    Py_ssize_t len;
13902
13903
70.1M
    len = PyUnicode_GET_LENGTH(str);
13904
70.1M
    if (len == 0)
13905
25.6M
        return 0;
13906
44.5M
    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13907
44.5M
    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13908
23.6M
        if (writer->buffer == NULL && !writer->overallocate) {
13909
7.32k
            assert(_PyUnicode_CheckConsistency(str, 1));
13910
7.32k
            writer->readonly = 1;
13911
7.32k
            writer->buffer = Py_NewRef(str);
13912
7.32k
            _PyUnicodeWriter_Update(writer);
13913
7.32k
            writer->pos += len;
13914
7.32k
            return 0;
13915
7.32k
        }
13916
23.6M
        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13917
0
            return -1;
13918
23.6M
    }
13919
44.5M
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13920
44.5M
                                  str, 0, len);
13921
44.5M
    writer->pos += len;
13922
44.5M
    return 0;
13923
44.5M
}
13924
13925
int
13926
PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
13927
5.78M
{
13928
5.78M
    PyTypeObject *type = Py_TYPE(obj);
13929
5.78M
    if (type == &PyUnicode_Type) {
13930
5.78M
        return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj);
13931
5.78M
    }
13932
13933
0
    if (type == &PyLong_Type) {
13934
0
        return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
13935
0
    }
13936
13937
0
    PyObject *str = PyObject_Str(obj);
13938
0
    if (str == NULL) {
13939
0
        return -1;
13940
0
    }
13941
13942
0
    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
13943
0
    Py_DECREF(str);
13944
0
    return res;
13945
0
}
13946
13947
13948
int
13949
PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
13950
9.46M
{
13951
9.46M
    if (Py_TYPE(obj) == &PyLong_Type) {
13952
1.98M
        return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
13953
1.98M
    }
13954
13955
7.48M
    PyObject *repr = PyObject_Repr(obj);
13956
7.48M
    if (repr == NULL) {
13957
0
        return -1;
13958
0
    }
13959
13960
7.48M
    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr);
13961
7.48M
    Py_DECREF(repr);
13962
7.48M
    return res;
13963
7.48M
}
13964
13965
13966
int
13967
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13968
                                Py_ssize_t start, Py_ssize_t end)
13969
71.9M
{
13970
71.9M
    assert(0 <= start);
13971
71.9M
    assert(end <= PyUnicode_GET_LENGTH(str));
13972
71.9M
    assert(start <= end);
13973
13974
71.9M
    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13975
98
        return _PyUnicodeWriter_WriteStr(writer, str);
13976
13977
71.9M
    Py_ssize_t len = end - start;
13978
71.9M
    if (len == 0) {
13979
0
        return 0;
13980
0
    }
13981
13982
71.9M
    Py_UCS4 maxchar;
13983
71.9M
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) {
13984
15.4M
        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13985
15.4M
    }
13986
56.4M
    else {
13987
56.4M
        maxchar = writer->maxchar;
13988
56.4M
    }
13989
71.9M
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) {
13990
0
        return -1;
13991
0
    }
13992
13993
71.9M
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13994
71.9M
                                  str, start, len);
13995
71.9M
    writer->pos += len;
13996
71.9M
    return 0;
13997
71.9M
}
13998
13999
14000
int
14001
PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
14002
                               Py_ssize_t start, Py_ssize_t end)
14003
702k
{
14004
702k
    if (!PyUnicode_Check(str)) {
14005
0
        PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
14006
0
        return -1;
14007
0
    }
14008
702k
    if (start < 0 || start > end) {
14009
0
        PyErr_Format(PyExc_ValueError, "invalid start argument");
14010
0
        return -1;
14011
0
    }
14012
702k
    if (end > PyUnicode_GET_LENGTH(str)) {
14013
0
        PyErr_Format(PyExc_ValueError, "invalid end argument");
14014
0
        return -1;
14015
0
    }
14016
14017
702k
    return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
14018
702k
                                           start, end);
14019
702k
}
14020
14021
14022
int
14023
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14024
                                  const char *ascii, Py_ssize_t len)
14025
56.8M
{
14026
56.8M
    if (len == -1)
14027
0
        len = strlen(ascii);
14028
14029
56.8M
    assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
14030
14031
56.8M
    if (writer->buffer == NULL && !writer->overallocate) {
14032
7.62k
        PyObject *str;
14033
14034
7.62k
        str = _PyUnicode_FromASCII(ascii, len);
14035
7.62k
        if (str == NULL)
14036
0
            return -1;
14037
14038
7.62k
        writer->readonly = 1;
14039
7.62k
        writer->buffer = str;
14040
7.62k
        _PyUnicodeWriter_Update(writer);
14041
7.62k
        writer->pos += len;
14042
7.62k
        return 0;
14043
7.62k
    }
14044
14045
56.8M
    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14046
0
        return -1;
14047
14048
56.8M
    switch (writer->kind)
14049
56.8M
    {
14050
56.7M
    case PyUnicode_1BYTE_KIND:
14051
56.7M
    {
14052
56.7M
        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14053
56.7M
        Py_UCS1 *data = writer->data;
14054
14055
56.7M
        memcpy(data + writer->pos, str, len);
14056
56.7M
        break;
14057
0
    }
14058
9.73k
    case PyUnicode_2BYTE_KIND:
14059
9.73k
    {
14060
9.73k
        _PyUnicode_CONVERT_BYTES(
14061
9.73k
            Py_UCS1, Py_UCS2,
14062
9.73k
            ascii, ascii + len,
14063
9.73k
            (Py_UCS2 *)writer->data + writer->pos);
14064
9.73k
        break;
14065
0
    }
14066
4.03k
    case PyUnicode_4BYTE_KIND:
14067
4.03k
    {
14068
4.03k
        _PyUnicode_CONVERT_BYTES(
14069
4.03k
            Py_UCS1, Py_UCS4,
14070
4.03k
            ascii, ascii + len,
14071
4.03k
            (Py_UCS4 *)writer->data + writer->pos);
14072
4.03k
        break;
14073
0
    }
14074
0
    default:
14075
0
        Py_UNREACHABLE();
14076
56.8M
    }
14077
14078
56.8M
    writer->pos += len;
14079
56.8M
    return 0;
14080
56.8M
}
14081
14082
14083
int
14084
PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
14085
                           const char *str,
14086
                           Py_ssize_t size)
14087
740k
{
14088
740k
    assert(writer != NULL);
14089
740k
    _Py_AssertHoldsTstate();
14090
14091
740k
    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
14092
740k
    return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size);
14093
740k
}
14094
14095
14096
int
14097
PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
14098
                          const char *str,
14099
                          Py_ssize_t size)
14100
0
{
14101
0
    if (size < 0) {
14102
0
        size = strlen(str);
14103
0
    }
14104
14105
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
14106
0
    Py_ssize_t old_pos = _writer->pos;
14107
0
    int res = unicode_decode_utf8_writer(_writer, str, size,
14108
0
                                         _Py_ERROR_STRICT, NULL, NULL);
14109
0
    if (res < 0) {
14110
0
        _writer->pos = old_pos;
14111
0
    }
14112
0
    return res;
14113
0
}
14114
14115
14116
int
14117
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
14118
                                   const char *string,
14119
                                   Py_ssize_t length,
14120
                                   const char *errors,
14121
                                   Py_ssize_t *consumed)
14122
0
{
14123
0
    if (length < 0) {
14124
0
        length = strlen(string);
14125
0
    }
14126
14127
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
14128
0
    Py_ssize_t old_pos = _writer->pos;
14129
0
    int res = unicode_decode_utf8_writer(_writer, string, length,
14130
0
                                         _Py_ERROR_UNKNOWN, errors, consumed);
14131
0
    if (res < 0) {
14132
0
        _writer->pos = old_pos;
14133
0
        if (consumed) {
14134
0
            *consumed = 0;
14135
0
        }
14136
0
    }
14137
0
    return res;
14138
0
}
14139
14140
14141
int
14142
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14143
                                   const char *str, Py_ssize_t len)
14144
0
{
14145
0
    Py_UCS4 maxchar;
14146
14147
0
    maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14148
0
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14149
0
        return -1;
14150
0
    unicode_write_cstr(writer->buffer, writer->pos, str, len);
14151
0
    writer->pos += len;
14152
0
    return 0;
14153
0
}
14154
14155
PyObject *
14156
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14157
51.5M
{
14158
51.5M
    PyObject *str;
14159
14160
51.5M
    if (writer->pos == 0) {
14161
712
        Py_CLEAR(writer->buffer);
14162
712
        _Py_RETURN_UNICODE_EMPTY();
14163
712
    }
14164
14165
51.5M
    str = writer->buffer;
14166
51.5M
    writer->buffer = NULL;
14167
14168
51.5M
    if (writer->readonly) {
14169
13.8k
        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14170
13.8k
        return str;
14171
13.8k
    }
14172
14173
51.5M
    if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14174
50.6M
        PyObject *str2;
14175
50.6M
        str2 = resize_compact(str, writer->pos);
14176
50.6M
        if (str2 == NULL) {
14177
0
            Py_DECREF(str);
14178
0
            return NULL;
14179
0
        }
14180
50.6M
        str = str2;
14181
50.6M
    }
14182
14183
51.5M
    assert(_PyUnicode_CheckConsistency(str, 1));
14184
51.5M
    return unicode_result(str);
14185
51.5M
}
14186
14187
14188
PyObject*
14189
PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
14190
4.70M
{
14191
4.70M
    PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer);
14192
4.70M
    assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
14193
4.70M
    _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
14194
4.70M
    return str;
14195
4.70M
}
14196
14197
14198
void
14199
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14200
5.22M
{
14201
5.22M
    Py_CLEAR(writer->buffer);
14202
5.22M
}
14203
14204
#include "stringlib/unicode_format.h"
14205
14206
PyDoc_STRVAR(format__doc__,
14207
             "format($self, /, *args, **kwargs)\n\
14208
--\n\
14209
\n\
14210
Return a formatted version of the string, using substitutions from args and kwargs.\n\
14211
The substitutions are identified by braces ('{' and '}').");
14212
14213
PyDoc_STRVAR(format_map__doc__,
14214
             "format_map($self, mapping, /)\n\
14215
--\n\
14216
\n\
14217
Return a formatted version of the string, using substitutions from mapping.\n\
14218
The substitutions are identified by braces ('{' and '}').");
14219
14220
/*[clinic input]
14221
str.__format__ as unicode___format__
14222
14223
    format_spec: unicode
14224
    /
14225
14226
Return a formatted version of the string as described by format_spec.
14227
[clinic start generated code]*/
14228
14229
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    /* Format `self` according to the whole of `format_spec` into a
       stack-allocated writer, then turn the writer into the result. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    const int status = _PyUnicode_FormatAdvancedWriter(
        &writer, self, format_spec, 0, PyUnicode_GET_LENGTH(format_spec));
    if (status != -1) {
        return _PyUnicodeWriter_Finish(&writer);
    }

    /* Formatting failed: drop whatever was written so far. */
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
14246
14247
/*[clinic input]
14248
str.__sizeof__ as unicode_sizeof
14249
14250
Return the size of the string in memory, in bytes.
14251
[clinic start generated code]*/
14252
14253
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    Py_ssize_t total;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* Compact ASCII: header plus one byte per character and one
           extra element. */
        total = sizeof(PyASCIIObject) + nchars + 1;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* Compact non-ASCII: header plus (length + 1) characters of the
           string's kind. */
        total = sizeof(PyCompactUnicodeObject) +
            (nchars + 1) * PyUnicode_KIND(self);
    }
    else {
        /* Two-block object: the base object, plus the separate character
           block when there is one. */
        total = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            total += (nchars + 1) * PyUnicode_KIND(self);
        }
    }

    /* Add the separately stored UTF-8 representation, if present. */
    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        total += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(total);
}
14281
14282
/* __getnewargs__ implementation: return a 1-tuple holding a copy of `v`.

   The "N" format code hands the reference to the copy over to the tuple.
   Returns NULL if the copy could not be made. */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *duplicate = _PyUnicode_Copy(v);
    return (duplicate != NULL) ? Py_BuildValue("(N)", duplicate) : NULL;
}
14290
14291
/*
14292
This function searchs the longest common leading whitespace
14293
of all lines in the [src, end).
14294
It returns the length of the common leading whitespace and sets `output` to
14295
point to the beginning of the common leading whitespace if length > 0.
14296
*/
14297
static Py_ssize_t
14298
search_longest_common_leading_whitespace(
14299
    const char *const src,
14300
    const char *const end,
14301
    const char **output)
14302
0
{
14303
    // [_start, _start + _len)
14304
    // describes the current longest common leading whitespace
14305
0
    const char *_start = NULL;
14306
0
    Py_ssize_t _len = 0;
14307
14308
0
    for (const char *iter = src; iter < end; ++iter) {
14309
0
        const char *line_start = iter;
14310
0
        const char *leading_whitespace_end = NULL;
14311
14312
        // scan the whole line
14313
0
        while (iter < end && *iter != '\n') {
14314
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
14315
                /* `iter` points to the first non-whitespace character
14316
                   in this line */
14317
0
                if (iter == line_start) {
14318
                    // some line has no indent, fast exit!
14319
0
                    return 0;
14320
0
                }
14321
0
                leading_whitespace_end = iter;
14322
0
            }
14323
0
            ++iter;
14324
0
        }
14325
14326
        // if this line has all white space, skip it
14327
0
        if (!leading_whitespace_end) {
14328
0
            continue;
14329
0
        }
14330
14331
0
        if (!_start) {
14332
            // update the first leading whitespace
14333
0
            _start = line_start;
14334
0
            _len = leading_whitespace_end - line_start;
14335
0
            assert(_len > 0);
14336
0
        }
14337
0
        else {
14338
            /* We then compare with the current longest leading whitespace.
14339
14340
               [line_start, leading_whitespace_end) is the leading
14341
               whitespace of this line,
14342
14343
               [_start, _start + _len) is the leading whitespace of the
14344
               current longest leading whitespace. */
14345
0
            Py_ssize_t new_len = 0;
14346
0
            const char *_iter = _start, *line_iter = line_start;
14347
14348
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
14349
0
                   && *_iter == *line_iter)
14350
0
            {
14351
0
                ++_iter;
14352
0
                ++line_iter;
14353
0
                ++new_len;
14354
0
            }
14355
14356
0
            _len = new_len;
14357
0
            if (_len == 0) {
14358
                // No common things now, fast exit!
14359
0
                return 0;
14360
0
            }
14361
0
        }
14362
0
    }
14363
14364
0
    assert(_len >= 0);
14365
0
    if (_len > 0) {
14366
0
        *output = _start;
14367
0
    }
14368
0
    return _len;
14369
0
}
14370
14371
/* Dedent a string.
14372
   Behaviour is expected to be an exact match of `textwrap.dedent`.
14373
   Return a new reference on success, NULL with exception set on error.
14374
   */
14375
PyObject *
14376
_PyUnicode_Dedent(PyObject *unicode)
14377
0
{
14378
0
    Py_ssize_t src_len = 0;
14379
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
14380
0
    if (!src) {
14381
0
        return NULL;
14382
0
    }
14383
0
    assert(src_len >= 0);
14384
0
    if (src_len == 0) {
14385
0
        return Py_NewRef(unicode);
14386
0
    }
14387
14388
0
    const char *const end = src + src_len;
14389
14390
    // [whitespace_start, whitespace_start + whitespace_len)
14391
    // describes the current longest common leading whitespace
14392
0
    const char *whitespace_start = NULL;
14393
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
14394
0
        src, end, &whitespace_start);
14395
14396
0
    if (whitespace_len == 0) {
14397
0
        return Py_NewRef(unicode);
14398
0
    }
14399
14400
    // now we should trigger a dedent
14401
0
    char *dest = PyMem_Malloc(src_len);
14402
0
    if (!dest) {
14403
0
        PyErr_NoMemory();
14404
0
        return NULL;
14405
0
    }
14406
0
    char *dest_iter = dest;
14407
14408
0
    for (const char *iter = src; iter < end; ++iter) {
14409
0
        const char *line_start = iter;
14410
0
        bool in_leading_space = true;
14411
14412
        // iterate over a line to find the end of a line
14413
0
        while (iter < end && *iter != '\n') {
14414
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
14415
0
                in_leading_space = false;
14416
0
            }
14417
0
            ++iter;
14418
0
        }
14419
14420
        // invariant: *iter == '\n' or iter == end
14421
0
        bool append_newline = iter < end;
14422
14423
        // if this line has all white space, write '\n' and continue
14424
0
        if (in_leading_space && append_newline) {
14425
0
            *dest_iter++ = '\n';
14426
0
            continue;
14427
0
        }
14428
14429
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
14430
            conditionally append '\n' */
14431
14432
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
14433
0
        assert(new_line_len >= 0);
14434
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
14435
14436
0
        dest_iter += new_line_len;
14437
14438
0
        if (append_newline) {
14439
0
            *dest_iter++ = '\n';
14440
0
        }
14441
0
    }
14442
14443
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
14444
0
    PyMem_Free(dest);
14445
0
    return res;
14446
0
}
14447
14448
/* Method table for the str type.
   The UNICODE_*_METHODDEF names are macros defined outside this section
   (presumably generated by Argument Clinic -- confirm in the clinic
   header); each is written without a trailing comma.  The table ends with
   a {NULL, NULL} sentinel entry. */
static PyMethodDef unicode_methods[] = {
14449
    UNICODE_ENCODE_METHODDEF
14450
    UNICODE_REPLACE_METHODDEF
14451
    UNICODE_SPLIT_METHODDEF
14452
    UNICODE_RSPLIT_METHODDEF
14453
    UNICODE_JOIN_METHODDEF
14454
    UNICODE_CAPITALIZE_METHODDEF
14455
    UNICODE_CASEFOLD_METHODDEF
14456
    UNICODE_TITLE_METHODDEF
14457
    UNICODE_CENTER_METHODDEF
14458
    UNICODE_COUNT_METHODDEF
14459
    UNICODE_EXPANDTABS_METHODDEF
14460
    UNICODE_FIND_METHODDEF
14461
    UNICODE_PARTITION_METHODDEF
14462
    UNICODE_INDEX_METHODDEF
14463
    UNICODE_LJUST_METHODDEF
14464
    UNICODE_LOWER_METHODDEF
14465
    UNICODE_LSTRIP_METHODDEF
14466
    UNICODE_RFIND_METHODDEF
14467
    UNICODE_RINDEX_METHODDEF
14468
    UNICODE_RJUST_METHODDEF
14469
    UNICODE_RSTRIP_METHODDEF
14470
    UNICODE_RPARTITION_METHODDEF
14471
    UNICODE_SPLITLINES_METHODDEF
14472
    UNICODE_STRIP_METHODDEF
14473
    UNICODE_SWAPCASE_METHODDEF
14474
    UNICODE_TRANSLATE_METHODDEF
14475
    UNICODE_UPPER_METHODDEF
14476
    UNICODE_STARTSWITH_METHODDEF
14477
    UNICODE_ENDSWITH_METHODDEF
14478
    UNICODE_REMOVEPREFIX_METHODDEF
14479
    UNICODE_REMOVESUFFIX_METHODDEF
14480
    UNICODE_ISASCII_METHODDEF
14481
    UNICODE_ISLOWER_METHODDEF
14482
    UNICODE_ISUPPER_METHODDEF
14483
    UNICODE_ISTITLE_METHODDEF
14484
    UNICODE_ISSPACE_METHODDEF
14485
    UNICODE_ISDECIMAL_METHODDEF
14486
    UNICODE_ISDIGIT_METHODDEF
14487
    UNICODE_ISNUMERIC_METHODDEF
14488
    UNICODE_ISALPHA_METHODDEF
14489
    UNICODE_ISALNUM_METHODDEF
14490
    UNICODE_ISIDENTIFIER_METHODDEF
14491
    UNICODE_ISPRINTABLE_METHODDEF
14492
    UNICODE_ZFILL_METHODDEF
14493
    /* format() and format_map() are registered by hand, with their
       docstrings taken from the PyDoc_STRVAR definitions in this file. */
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
14494
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
14495
    UNICODE___FORMAT___METHODDEF
14496
    UNICODE_MAKETRANS_METHODDEF
14497
    UNICODE_SIZEOF_METHODDEF
14498
    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
14499
    {NULL, NULL}
14500
};
14501
14502
/* nb_remainder slot: implements `v % w` as printf-style formatting.

   Returns NotImplemented when the left operand is not a str, otherwise
   the result of PyUnicode_Format(v, w). */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
14509
14510
static PyNumberMethods unicode_as_number = {
14511
    0,              /*nb_add*/
14512
    0,              /*nb_subtract*/
14513
    0,              /*nb_multiply*/
14514
    unicode_mod,            /*nb_remainder*/
14515
};
14516
14517
static PySequenceMethods unicode_as_sequence = {
14518
    unicode_length,     /* sq_length */
14519
    PyUnicode_Concat,   /* sq_concat */
14520
    unicode_repeat,     /* sq_repeat */
14521
    unicode_getitem,    /* sq_item */
14522
    0,                  /* sq_slice */
14523
    0,                  /* sq_ass_item */
14524
    0,                  /* sq_ass_slice */
14525
    PyUnicode_Contains, /* sq_contains */
14526
};
14527
14528
/* mp_subscript slot: implements self[item] for integer-like indexes and
   for slices.

   - Index: negative values are offset by the string length, then the
     lookup is delegated to unicode_getitem().
   - Slice: empty results, whole-string slices and step-1 slices take
     shortcuts; any other step copies character by character into a new
     string sized for the largest character actually selected.
   - Anything else raises TypeError.

   Returns a new reference, or NULL with an exception set. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t count = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
                                                   &start, &stop, step);

    if (count <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        if (start == 0 && count == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + count);
    }

    /* General case */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* Work out the largest character in the selection so the result can
       use the narrowest suitable representation. */
    Py_UCS4 max_char;
    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        size_t pos = start;
        max_char = 0;
        for (Py_ssize_t n = 0; n < count; n++, pos += step) {
            const Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
            if (ch > max_char) {
                max_char = ch;
                /* Stop scanning once the limit for this kind is reached. */
                if (max_char >= kind_limit) {
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(count, max_char);
    if (result == NULL) {
        return NULL;
    }
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    size_t pos = start;
    for (Py_ssize_t n = 0; n < count; n++, pos += step) {
        const Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
14597
14598
static PyMappingMethods unicode_as_mapping = {
14599
    unicode_length,     /* mp_length */
14600
    unicode_subscript,  /* mp_subscript */
14601
    0,                  /* mp_ass_subscript */
14602
};
14603
14604
14605
/* Helpers for PyUnicode_Format() */
14606
14607
struct unicode_formatter_t {
14608
    PyObject *args;
14609
    int args_owned;
14610
    Py_ssize_t arglen, argidx;
14611
    PyObject *dict;
14612
14613
    int fmtkind;
14614
    Py_ssize_t fmtcnt, fmtpos;
14615
    const void *fmtdata;
14616
    PyObject *fmtstr;
14617
14618
    _PyUnicodeWriter writer;
14619
};
14620
14621
struct unicode_format_arg_t {
14622
    Py_UCS4 ch;
14623
    int flags;
14624
    Py_ssize_t width;
14625
    int prec;
14626
    int sign;
14627
};
14628
14629
static PyObject *
14630
unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14631
51.7M
{
14632
51.7M
    Py_ssize_t argidx = ctx->argidx;
14633
14634
51.7M
    if (argidx < ctx->arglen) {
14635
51.7M
        ctx->argidx++;
14636
51.7M
        if (ctx->arglen < 0)
14637
20.2M
            return ctx->args;
14638
31.5M
        else
14639
31.5M
            return PyTuple_GetItem(ctx->args, argidx);
14640
51.7M
    }
14641
0
    PyErr_SetString(PyExc_TypeError,
14642
0
                    "not enough arguments for format string");
14643
0
    return NULL;
14644
51.7M
}
14645
14646
/* Returns a new reference to a PyUnicode object, or NULL on failure. */
14647
14648
/* Format a float into the writer if the writer is not NULL, or into *p_output
14649
   otherwise.
14650
14651
   Return 0 on success, raise an exception and return -1 on error. */
14652
static int
14653
formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14654
            PyObject **p_output,
14655
            _PyUnicodeWriter *writer)
14656
0
{
14657
0
    char *p;
14658
0
    double x;
14659
0
    Py_ssize_t len;
14660
0
    int prec;
14661
0
    int dtoa_flags = 0;
14662
14663
0
    x = PyFloat_AsDouble(v);
14664
0
    if (x == -1.0 && PyErr_Occurred())
14665
0
        return -1;
14666
14667
0
    prec = arg->prec;
14668
0
    if (prec < 0)
14669
0
        prec = 6;
14670
14671
0
    if (arg->flags & F_ALT)
14672
0
        dtoa_flags |= Py_DTSF_ALT;
14673
0
    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14674
0
    if (p == NULL)
14675
0
        return -1;
14676
0
    len = strlen(p);
14677
0
    if (writer) {
14678
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14679
0
            PyMem_Free(p);
14680
0
            return -1;
14681
0
        }
14682
0
    }
14683
0
    else
14684
0
        *p_output = _PyUnicode_FromASCII(p, len);
14685
0
    PyMem_Free(p);
14686
0
    return 0;
14687
0
}
14688
14689
/* formatlong() emulates the format codes d, u, o, x and X, and
14690
 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
14691
 * Python's regular ints.
14692
 * Return value:  a new PyUnicodeObject*, or NULL if error.
14693
 *     The output string is of the form
14694
 *         "-"? ("0x" | "0X")? digit+
14695
 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
14696
 *         set in flags.  The case of hex digits will be correct,
14697
 *     There will be at least prec digits, zero-filled on the left if
14698
 *         necessary to get that many.
14699
 * val          object to be converted
14700
 * flags        bitmask of format flags; only F_ALT is looked at
14701
 * prec         minimum number of digits; 0-fill on left if needed
14702
 * type         a character in [duoxX]; u acts the same as d
14703
 *
14704
 * CAUTION:  o, x and X conversions on regular ints can never
14705
 * produce a '-' sign, but can for Python's unbounded ints.
14706
 */
14707
PyObject *
14708
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14709
1.53k
{
14710
1.53k
    PyObject *result = NULL;
14711
1.53k
    char *buf;
14712
1.53k
    Py_ssize_t i;
14713
1.53k
    int sign;           /* 1 if '-', else 0 */
14714
1.53k
    int len;            /* number of characters */
14715
1.53k
    Py_ssize_t llen;
14716
1.53k
    int numdigits;      /* len == numnondigits + numdigits */
14717
1.53k
    int numnondigits = 0;
14718
14719
    /* Avoid exceeding SSIZE_T_MAX */
14720
1.53k
    if (prec > INT_MAX-3) {
14721
0
        PyErr_SetString(PyExc_OverflowError,
14722
0
                        "precision too large");
14723
0
        return NULL;
14724
0
    }
14725
14726
1.53k
    assert(PyLong_Check(val));
14727
14728
1.53k
    switch (type) {
14729
0
    default:
14730
0
        Py_UNREACHABLE();
14731
0
    case 'd':
14732
0
    case 'i':
14733
0
    case 'u':
14734
        /* int and int subclasses should print numerically when a numeric */
14735
        /* format code is used (see issue18780) */
14736
0
        result = PyNumber_ToBase(val, 10);
14737
0
        break;
14738
0
    case 'o':
14739
0
        numnondigits = 2;
14740
0
        result = PyNumber_ToBase(val, 8);
14741
0
        break;
14742
0
    case 'x':
14743
1.53k
    case 'X':
14744
1.53k
        numnondigits = 2;
14745
1.53k
        result = PyNumber_ToBase(val, 16);
14746
1.53k
        break;
14747
1.53k
    }
14748
1.53k
    if (!result)
14749
0
        return NULL;
14750
14751
1.53k
    assert(unicode_modifiable(result));
14752
1.53k
    assert(PyUnicode_IS_ASCII(result));
14753
14754
    /* To modify the string in-place, there can only be one reference. */
14755
1.53k
    if (!_PyObject_IsUniquelyReferenced(result)) {
14756
0
        Py_DECREF(result);
14757
0
        PyErr_BadInternalCall();
14758
0
        return NULL;
14759
0
    }
14760
1.53k
    buf = PyUnicode_DATA(result);
14761
1.53k
    llen = PyUnicode_GET_LENGTH(result);
14762
1.53k
    if (llen > INT_MAX) {
14763
0
        Py_DECREF(result);
14764
0
        PyErr_SetString(PyExc_ValueError,
14765
0
                        "string too large in _PyUnicode_FormatLong");
14766
0
        return NULL;
14767
0
    }
14768
1.53k
    len = (int)llen;
14769
1.53k
    sign = buf[0] == '-';
14770
1.53k
    numnondigits += sign;
14771
1.53k
    numdigits = len - numnondigits;
14772
1.53k
    assert(numdigits > 0);
14773
14774
    /* Get rid of base marker unless F_ALT */
14775
1.53k
    if (((alt) == 0 &&
14776
1.53k
        (type == 'o' || type == 'x' || type == 'X'))) {
14777
1.53k
        assert(buf[sign] == '0');
14778
1.53k
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14779
1.53k
               buf[sign+1] == 'o');
14780
1.53k
        numnondigits -= 2;
14781
1.53k
        buf += 2;
14782
1.53k
        len -= 2;
14783
1.53k
        if (sign)
14784
0
            buf[0] = '-';
14785
1.53k
        assert(len == numnondigits + numdigits);
14786
1.53k
        assert(numdigits > 0);
14787
1.53k
    }
14788
14789
    /* Fill with leading zeroes to meet minimum width. */
14790
1.53k
    if (prec > numdigits) {
14791
0
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14792
0
                                numnondigits + prec);
14793
0
        char *b1;
14794
0
        if (!r1) {
14795
0
            Py_DECREF(result);
14796
0
            return NULL;
14797
0
        }
14798
0
        b1 = PyBytes_AS_STRING(r1);
14799
0
        for (i = 0; i < numnondigits; ++i)
14800
0
            *b1++ = *buf++;
14801
0
        for (i = 0; i < prec - numdigits; i++)
14802
0
            *b1++ = '0';
14803
0
        for (i = 0; i < numdigits; i++)
14804
0
            *b1++ = *buf++;
14805
0
        *b1 = '\0';
14806
0
        Py_SETREF(result, r1);
14807
0
        buf = PyBytes_AS_STRING(result);
14808
0
        len = numnondigits + prec;
14809
0
    }
14810
14811
    /* Fix up case for hex conversions. */
14812
1.53k
    if (type == 'X') {
14813
        /* Need to convert all lower case letters to upper case.
14814
           and need to convert 0x to 0X (and -0x to -0X). */
14815
4.51k
        for (i = 0; i < len; i++)
14816
2.97k
            if (buf[i] >= 'a' && buf[i] <= 'x')
14817
1.15k
                buf[i] -= 'a'-'A';
14818
1.53k
    }
14819
1.53k
    if (!PyUnicode_Check(result)
14820
1.53k
        || buf != PyUnicode_DATA(result)) {
14821
1.53k
        PyObject *unicode;
14822
1.53k
        unicode = _PyUnicode_FromASCII(buf, len);
14823
1.53k
        Py_SETREF(result, unicode);
14824
1.53k
    }
14825
0
    else if (len != PyUnicode_GET_LENGTH(result)) {
14826
0
        if (PyUnicode_Resize(&result, len) < 0)
14827
0
            Py_CLEAR(result);
14828
0
    }
14829
1.53k
    return result;
14830
1.53k
}
14831
14832
/* Format an integer or a float as an integer.
14833
 * Return 1 if the number has been formatted into the writer,
14834
 *        0 if the number has been formatted into *p_output
14835
 *       -1 and raise an exception on error */
14836
static int
14837
mainformatlong(PyObject *v,
14838
               struct unicode_format_arg_t *arg,
14839
               PyObject **p_output,
14840
               _PyUnicodeWriter *writer)
14841
12.6M
{
14842
12.6M
    PyObject *iobj, *res;
14843
12.6M
    char type = (char)arg->ch;
14844
14845
12.6M
    if (!PyNumber_Check(v))
14846
5.16M
        goto wrongtype;
14847
14848
    /* make sure number is a type of integer for o, x, and X */
14849
7.48M
    if (!PyLong_Check(v)) {
14850
0
        if (type == 'o' || type == 'x' || type == 'X') {
14851
0
            iobj = _PyNumber_Index(v);
14852
0
        }
14853
0
        else {
14854
0
            iobj = PyNumber_Long(v);
14855
0
        }
14856
0
        if (iobj == NULL ) {
14857
0
            if (PyErr_ExceptionMatches(PyExc_TypeError))
14858
0
                goto wrongtype;
14859
0
            return -1;
14860
0
        }
14861
0
        assert(PyLong_Check(iobj));
14862
0
    }
14863
7.48M
    else {
14864
7.48M
        iobj = Py_NewRef(v);
14865
7.48M
    }
14866
14867
7.48M
    if (PyLong_CheckExact(v)
14868
7.48M
        && arg->width == -1 && arg->prec == -1
14869
7.48M
        && !(arg->flags & (F_SIGN | F_BLANK))
14870
7.48M
        && type != 'X')
14871
7.48M
    {
14872
        /* Fast path */
14873
7.48M
        int alternate = arg->flags & F_ALT;
14874
7.48M
        int base;
14875
14876
7.48M
        switch(type)
14877
7.48M
        {
14878
0
            default:
14879
0
                Py_UNREACHABLE();
14880
7.48M
            case 'd':
14881
7.48M
            case 'i':
14882
7.48M
            case 'u':
14883
7.48M
                base = 10;
14884
7.48M
                break;
14885
0
            case 'o':
14886
0
                base = 8;
14887
0
                break;
14888
0
            case 'x':
14889
0
            case 'X':
14890
0
                base = 16;
14891
0
                break;
14892
7.48M
        }
14893
14894
7.48M
        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14895
0
            Py_DECREF(iobj);
14896
0
            return -1;
14897
0
        }
14898
7.48M
        Py_DECREF(iobj);
14899
7.48M
        return 1;
14900
7.48M
    }
14901
14902
1.53k
    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14903
1.53k
    Py_DECREF(iobj);
14904
1.53k
    if (res == NULL)
14905
0
        return -1;
14906
1.53k
    *p_output = res;
14907
1.53k
    return 0;
14908
14909
5.16M
wrongtype:
14910
5.16M
    switch(type)
14911
5.16M
    {
14912
0
        case 'o':
14913
0
        case 'x':
14914
0
        case 'X':
14915
0
            PyErr_Format(PyExc_TypeError,
14916
0
                    "%%%c format: an integer is required, "
14917
0
                    "not %.200s",
14918
0
                    type, Py_TYPE(v)->tp_name);
14919
0
            break;
14920
5.16M
        default:
14921
5.16M
            PyErr_Format(PyExc_TypeError,
14922
5.16M
                    "%%%c format: a real number is required, "
14923
5.16M
                    "not %.200s",
14924
5.16M
                    type, Py_TYPE(v)->tp_name);
14925
5.16M
            break;
14926
5.16M
    }
14927
5.16M
    return -1;
14928
5.16M
}
14929
14930
static Py_UCS4
14931
formatchar(PyObject *v)
14932
0
{
14933
    /* presume that the buffer is at least 3 characters long */
14934
0
    if (PyUnicode_Check(v)) {
14935
0
        if (PyUnicode_GET_LENGTH(v) == 1) {
14936
0
            return PyUnicode_READ_CHAR(v, 0);
14937
0
        }
14938
0
        PyErr_Format(PyExc_TypeError,
14939
0
                     "%%c requires an int or a unicode character, "
14940
0
                     "not a string of length %zd",
14941
0
                     PyUnicode_GET_LENGTH(v));
14942
0
        return (Py_UCS4) -1;
14943
0
    }
14944
0
    else {
14945
0
        int overflow;
14946
0
        long x = PyLong_AsLongAndOverflow(v, &overflow);
14947
0
        if (x == -1 && PyErr_Occurred()) {
14948
0
            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14949
0
                PyErr_Format(PyExc_TypeError,
14950
0
                             "%%c requires an int or a unicode character, not %T",
14951
0
                             v);
14952
0
                return (Py_UCS4) -1;
14953
0
            }
14954
0
            return (Py_UCS4) -1;
14955
0
        }
14956
14957
0
        if (x < 0 || x > MAX_UNICODE) {
14958
            /* this includes an overflow in converting to C long */
14959
0
            PyErr_SetString(PyExc_OverflowError,
14960
0
                            "%c arg not in range(0x110000)");
14961
0
            return (Py_UCS4) -1;
14962
0
        }
14963
14964
0
        return (Py_UCS4) x;
14965
0
    }
14966
0
}
14967
14968
/* Parse options of an argument: flags, width, precision.
14969
   Handle also "%(name)" syntax.
14970
14971
   Return 0 if the argument has been formatted into arg->str.
14972
   Return 1 if the argument has been written into ctx->writer,
14973
   Raise an exception and return -1 on error. */
14974
static int
14975
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14976
                         struct unicode_format_arg_t *arg)
14977
51.7M
{
14978
51.7M
#define FORMAT_READ(ctx) \
14979
52.0M
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14980
14981
51.7M
    PyObject *v;
14982
14983
51.7M
    if (arg->ch == '(') {
14984
        /* Get argument value from a dictionary. Example: "%(name)s". */
14985
38.6k
        Py_ssize_t keystart;
14986
38.6k
        Py_ssize_t keylen;
14987
38.6k
        PyObject *key;
14988
38.6k
        int pcount = 1;
14989
14990
38.6k
        if (ctx->dict == NULL) {
14991
0
            PyErr_SetString(PyExc_TypeError,
14992
0
                            "format requires a mapping");
14993
0
            return -1;
14994
0
        }
14995
38.6k
        ++ctx->fmtpos;
14996
38.6k
        --ctx->fmtcnt;
14997
38.6k
        keystart = ctx->fmtpos;
14998
        /* Skip over balanced parentheses */
14999
347k
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
15000
309k
            arg->ch = FORMAT_READ(ctx);
15001
309k
            if (arg->ch == ')')
15002
38.6k
                --pcount;
15003
270k
            else if (arg->ch == '(')
15004
0
                ++pcount;
15005
309k
            ctx->fmtpos++;
15006
309k
        }
15007
38.6k
        keylen = ctx->fmtpos - keystart - 1;
15008
38.6k
        if (ctx->fmtcnt < 0 || pcount > 0) {
15009
0
            PyErr_SetString(PyExc_ValueError,
15010
0
                            "incomplete format key");
15011
0
            return -1;
15012
0
        }
15013
38.6k
        key = PyUnicode_Substring(ctx->fmtstr,
15014
38.6k
                                  keystart, keystart + keylen);
15015
38.6k
        if (key == NULL)
15016
0
            return -1;
15017
38.6k
        if (ctx->args_owned) {
15018
27.6k
            ctx->args_owned = 0;
15019
27.6k
            Py_DECREF(ctx->args);
15020
27.6k
        }
15021
38.6k
        ctx->args = PyObject_GetItem(ctx->dict, key);
15022
38.6k
        Py_DECREF(key);
15023
38.6k
        if (ctx->args == NULL)
15024
0
            return -1;
15025
38.6k
        ctx->args_owned = 1;
15026
38.6k
        ctx->arglen = -1;
15027
38.6k
        ctx->argidx = -2;
15028
38.6k
    }
15029
15030
    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
15031
51.7M
    while (--ctx->fmtcnt >= 0) {
15032
51.7M
        arg->ch = FORMAT_READ(ctx);
15033
51.7M
        ctx->fmtpos++;
15034
51.7M
        switch (arg->ch) {
15035
0
        case '-': arg->flags |= F_LJUST; continue;
15036
0
        case '+': arg->flags |= F_SIGN; continue;
15037
0
        case ' ': arg->flags |= F_BLANK; continue;
15038
0
        case '#': arg->flags |= F_ALT; continue;
15039
1.53k
        case '0': arg->flags |= F_ZERO; continue;
15040
51.7M
        }
15041
51.7M
        break;
15042
51.7M
    }
15043
15044
    /* Parse width. Example: "%10s" => width=10 */
15045
51.7M
    if (arg->ch == '*') {
15046
0
        v = unicode_format_getnextarg(ctx);
15047
0
        if (v == NULL)
15048
0
            return -1;
15049
0
        if (!PyLong_Check(v)) {
15050
0
            PyErr_SetString(PyExc_TypeError,
15051
0
                            "* wants int");
15052
0
            return -1;
15053
0
        }
15054
0
        arg->width = PyLong_AsSsize_t(v);
15055
0
        if (arg->width == -1 && PyErr_Occurred())
15056
0
            return -1;
15057
0
        if (arg->width < 0) {
15058
0
            arg->flags |= F_LJUST;
15059
0
            arg->width = -arg->width;
15060
0
        }
15061
0
        if (--ctx->fmtcnt >= 0) {
15062
0
            arg->ch = FORMAT_READ(ctx);
15063
0
            ctx->fmtpos++;
15064
0
        }
15065
0
    }
15066
51.7M
    else if (arg->ch >= '0' && arg->ch <= '9') {
15067
1.53k
        arg->width = arg->ch - '0';
15068
1.53k
        while (--ctx->fmtcnt >= 0) {
15069
1.53k
            arg->ch = FORMAT_READ(ctx);
15070
1.53k
            ctx->fmtpos++;
15071
1.53k
            if (arg->ch < '0' || arg->ch > '9')
15072
1.53k
                break;
15073
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
15074
               mixing signed and unsigned comparison. Since arg->ch is between
15075
               '0' and '9', casting to int is safe. */
15076
0
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15077
0
                PyErr_SetString(PyExc_ValueError,
15078
0
                                "width too big");
15079
0
                return -1;
15080
0
            }
15081
0
            arg->width = arg->width*10 + (arg->ch - '0');
15082
0
        }
15083
1.53k
    }
15084
15085
    /* Parse precision. Example: "%.3f" => prec=3 */
15086
51.7M
    if (arg->ch == '.') {
15087
0
        arg->prec = 0;
15088
0
        if (--ctx->fmtcnt >= 0) {
15089
0
            arg->ch = FORMAT_READ(ctx);
15090
0
            ctx->fmtpos++;
15091
0
        }
15092
0
        if (arg->ch == '*') {
15093
0
            v = unicode_format_getnextarg(ctx);
15094
0
            if (v == NULL)
15095
0
                return -1;
15096
0
            if (!PyLong_Check(v)) {
15097
0
                PyErr_SetString(PyExc_TypeError,
15098
0
                                "* wants int");
15099
0
                return -1;
15100
0
            }
15101
0
            arg->prec = PyLong_AsInt(v);
15102
0
            if (arg->prec == -1 && PyErr_Occurred())
15103
0
                return -1;
15104
0
            if (arg->prec < 0)
15105
0
                arg->prec = 0;
15106
0
            if (--ctx->fmtcnt >= 0) {
15107
0
                arg->ch = FORMAT_READ(ctx);
15108
0
                ctx->fmtpos++;
15109
0
            }
15110
0
        }
15111
0
        else if (arg->ch >= '0' && arg->ch <= '9') {
15112
0
            arg->prec = arg->ch - '0';
15113
0
            while (--ctx->fmtcnt >= 0) {
15114
0
                arg->ch = FORMAT_READ(ctx);
15115
0
                ctx->fmtpos++;
15116
0
                if (arg->ch < '0' || arg->ch > '9')
15117
0
                    break;
15118
0
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15119
0
                    PyErr_SetString(PyExc_ValueError,
15120
0
                                    "precision too big");
15121
0
                    return -1;
15122
0
                }
15123
0
                arg->prec = arg->prec*10 + (arg->ch - '0');
15124
0
            }
15125
0
        }
15126
0
    }
15127
15128
    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15129
51.7M
    if (ctx->fmtcnt >= 0) {
15130
51.7M
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15131
0
            if (--ctx->fmtcnt >= 0) {
15132
0
                arg->ch = FORMAT_READ(ctx);
15133
0
                ctx->fmtpos++;
15134
0
            }
15135
0
        }
15136
51.7M
    }
15137
51.7M
    if (ctx->fmtcnt < 0) {
15138
0
        PyErr_SetString(PyExc_ValueError,
15139
0
                        "incomplete format");
15140
0
        return -1;
15141
0
    }
15142
51.7M
    return 0;
15143
15144
51.7M
#undef FORMAT_READ
15145
51.7M
}
15146
15147
/* Format one argument. Supported conversion specifiers:
15148
15149
   - "s", "r", "a": any type
15150
   - "i", "d", "u": int or float
15151
   - "o", "x", "X": int
15152
   - "e", "E", "f", "F", "g", "G": float
15153
   - "c": int or str (1 character)
15154
15155
   When possible, the output is written directly into the Unicode writer
15156
   (ctx->writer). A string is created when padding is required.
15157
15158
   Return 0 if the argument has been formatted into *p_str,
15159
          1 if the argument has been written into ctx->writer,
15160
         -1 on error. */
15161
static int
15162
unicode_format_arg_format(struct unicode_formatter_t *ctx,
15163
                          struct unicode_format_arg_t *arg,
15164
                          PyObject **p_str)
15165
51.7M
{
15166
51.7M
    PyObject *v;
15167
51.7M
    _PyUnicodeWriter *writer = &ctx->writer;
15168
15169
51.7M
    if (ctx->fmtcnt == 0)
15170
12.5M
        ctx->writer.overallocate = 0;
15171
15172
51.7M
    v = unicode_format_getnextarg(ctx);
15173
51.7M
    if (v == NULL)
15174
0
        return -1;
15175
15176
15177
51.7M
    switch (arg->ch) {
15178
39.1M
    case 's':
15179
39.1M
    case 'r':
15180
39.1M
    case 'a':
15181
39.1M
        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15182
            /* Fast path */
15183
0
            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15184
0
                return -1;
15185
0
            return 1;
15186
0
        }
15187
15188
39.1M
        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15189
39.1M
            *p_str = Py_NewRef(v);
15190
39.1M
        }
15191
0
        else {
15192
0
            if (arg->ch == 's')
15193
0
                *p_str = PyObject_Str(v);
15194
0
            else if (arg->ch == 'r')
15195
0
                *p_str = PyObject_Repr(v);
15196
0
            else
15197
0
                *p_str = PyObject_ASCII(v);
15198
0
        }
15199
39.1M
        break;
15200
15201
0
    case 'i':
15202
12.6M
    case 'd':
15203
12.6M
    case 'u':
15204
12.6M
    case 'o':
15205
12.6M
    case 'x':
15206
12.6M
    case 'X':
15207
12.6M
    {
15208
12.6M
        int ret = mainformatlong(v, arg, p_str, writer);
15209
12.6M
        if (ret != 0)
15210
12.6M
            return ret;
15211
1.53k
        arg->sign = 1;
15212
1.53k
        break;
15213
12.6M
    }
15214
15215
0
    case 'e':
15216
0
    case 'E':
15217
0
    case 'f':
15218
0
    case 'F':
15219
0
    case 'g':
15220
0
    case 'G':
15221
0
        if (arg->width == -1 && arg->prec == -1
15222
0
            && !(arg->flags & (F_SIGN | F_BLANK)))
15223
0
        {
15224
            /* Fast path */
15225
0
            if (formatfloat(v, arg, NULL, writer) == -1)
15226
0
                return -1;
15227
0
            return 1;
15228
0
        }
15229
15230
0
        arg->sign = 1;
15231
0
        if (formatfloat(v, arg, p_str, NULL) == -1)
15232
0
            return -1;
15233
0
        break;
15234
15235
0
    case 'c':
15236
0
    {
15237
0
        Py_UCS4 ch = formatchar(v);
15238
0
        if (ch == (Py_UCS4) -1)
15239
0
            return -1;
15240
0
        if (arg->width == -1 && arg->prec == -1) {
15241
            /* Fast path */
15242
0
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
15243
0
                return -1;
15244
0
            return 1;
15245
0
        }
15246
0
        *p_str = PyUnicode_FromOrdinal(ch);
15247
0
        break;
15248
0
    }
15249
15250
0
    default:
15251
0
        PyErr_Format(PyExc_ValueError,
15252
0
                     "unsupported format character '%c' (0x%x) "
15253
0
                     "at index %zd",
15254
0
                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15255
0
                     (int)arg->ch,
15256
0
                     ctx->fmtpos - 1);
15257
0
        return -1;
15258
51.7M
    }
15259
39.1M
    if (*p_str == NULL)
15260
0
        return -1;
15261
39.1M
    assert (PyUnicode_Check(*p_str));
15262
39.1M
    return 0;
15263
39.1M
}
15264
15265
static int
15266
unicode_format_arg_output(struct unicode_formatter_t *ctx,
15267
                          struct unicode_format_arg_t *arg,
15268
                          PyObject *str)
15269
39.1M
{
15270
39.1M
    Py_ssize_t len;
15271
39.1M
    int kind;
15272
39.1M
    const void *pbuf;
15273
39.1M
    Py_ssize_t pindex;
15274
39.1M
    Py_UCS4 signchar;
15275
39.1M
    Py_ssize_t buflen;
15276
39.1M
    Py_UCS4 maxchar;
15277
39.1M
    Py_ssize_t sublen;
15278
39.1M
    _PyUnicodeWriter *writer = &ctx->writer;
15279
39.1M
    Py_UCS4 fill;
15280
15281
39.1M
    fill = ' ';
15282
39.1M
    if (arg->sign && arg->flags & F_ZERO)
15283
1.53k
        fill = '0';
15284
15285
39.1M
    len = PyUnicode_GET_LENGTH(str);
15286
39.1M
    if ((arg->width == -1 || arg->width <= len)
15287
39.1M
        && (arg->prec == -1 || arg->prec >= len)
15288
39.1M
        && !(arg->flags & (F_SIGN | F_BLANK)))
15289
39.1M
    {
15290
        /* Fast path */
15291
39.1M
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15292
0
            return -1;
15293
39.1M
        return 0;
15294
39.1M
    }
15295
15296
    /* Truncate the string for "s", "r" and "a" formats
15297
       if the precision is set */
15298
96
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15299
0
        if (arg->prec >= 0 && len > arg->prec)
15300
0
            len = arg->prec;
15301
0
    }
15302
15303
    /* Adjust sign and width */
15304
96
    kind = PyUnicode_KIND(str);
15305
96
    pbuf = PyUnicode_DATA(str);
15306
96
    pindex = 0;
15307
96
    signchar = '\0';
15308
96
    if (arg->sign) {
15309
96
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15310
96
        if (ch == '-' || ch == '+') {
15311
0
            signchar = ch;
15312
0
            len--;
15313
0
            pindex++;
15314
0
        }
15315
96
        else if (arg->flags & F_SIGN)
15316
0
            signchar = '+';
15317
96
        else if (arg->flags & F_BLANK)
15318
0
            signchar = ' ';
15319
96
        else
15320
96
            arg->sign = 0;
15321
96
    }
15322
96
    if (arg->width < len)
15323
0
        arg->width = len;
15324
15325
    /* Prepare the writer */
15326
96
    maxchar = writer->maxchar;
15327
96
    if (!(arg->flags & F_LJUST)) {
15328
96
        if (arg->sign) {
15329
0
            if ((arg->width-1) > len)
15330
0
                maxchar = Py_MAX(maxchar, fill);
15331
0
        }
15332
96
        else {
15333
96
            if (arg->width > len)
15334
96
                maxchar = Py_MAX(maxchar, fill);
15335
96
        }
15336
96
    }
15337
96
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15338
0
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
15339
0
        maxchar = Py_MAX(maxchar, strmaxchar);
15340
0
    }
15341
15342
96
    buflen = arg->width;
15343
96
    if (arg->sign && len == arg->width)
15344
0
        buflen++;
15345
96
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
15346
0
        return -1;
15347
15348
    /* Write the sign if needed */
15349
96
    if (arg->sign) {
15350
0
        if (fill != ' ') {
15351
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15352
0
            writer->pos += 1;
15353
0
        }
15354
0
        if (arg->width > len)
15355
0
            arg->width--;
15356
0
    }
15357
15358
    /* Write the numeric prefix for "x", "X" and "o" formats
15359
       if the alternate form is used.
15360
       For example, write "0x" for the "%#x" format. */
15361
96
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15362
0
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15363
0
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15364
0
        if (fill != ' ') {
15365
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15366
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15367
0
            writer->pos += 2;
15368
0
            pindex += 2;
15369
0
        }
15370
0
        arg->width -= 2;
15371
0
        if (arg->width < 0)
15372
0
            arg->width = 0;
15373
0
        len -= 2;
15374
0
    }
15375
15376
    /* Pad left with the fill character if needed */
15377
96
    if (arg->width > len && !(arg->flags & F_LJUST)) {
15378
96
        sublen = arg->width - len;
15379
96
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
15380
96
        writer->pos += sublen;
15381
96
        arg->width = len;
15382
96
    }
15383
15384
    /* If padding with spaces: write sign if needed and/or numeric prefix if
15385
       the alternate form is used */
15386
96
    if (fill == ' ') {
15387
0
        if (arg->sign) {
15388
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15389
0
            writer->pos += 1;
15390
0
        }
15391
0
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15392
0
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15393
0
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15394
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15395
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15396
0
            writer->pos += 2;
15397
0
            pindex += 2;
15398
0
        }
15399
0
    }
15400
15401
    /* Write characters */
15402
96
    if (len) {
15403
96
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15404
96
                                      str, pindex, len);
15405
96
        writer->pos += len;
15406
96
    }
15407
15408
    /* Pad right with the fill character if needed */
15409
96
    if (arg->width > len) {
15410
0
        sublen = arg->width - len;
15411
0
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
15412
0
        writer->pos += sublen;
15413
0
    }
15414
96
    return 0;
15415
96
}
15416
15417
/* Helper of PyUnicode_Format(): format one arg.
15418
   Return 0 on success, raise an exception and return -1 on error. */
15419
static int
15420
unicode_format_arg(struct unicode_formatter_t *ctx)
15421
51.7M
{
15422
51.7M
    struct unicode_format_arg_t arg;
15423
51.7M
    PyObject *str;
15424
51.7M
    int ret;
15425
15426
51.7M
    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15427
51.7M
    if (arg.ch == '%') {
15428
0
        ctx->fmtpos++;
15429
0
        ctx->fmtcnt--;
15430
0
        if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15431
0
            return -1;
15432
0
        return 0;
15433
0
    }
15434
51.7M
    arg.flags = 0;
15435
51.7M
    arg.width = -1;
15436
51.7M
    arg.prec = -1;
15437
51.7M
    arg.sign = 0;
15438
51.7M
    str = NULL;
15439
15440
51.7M
    ret = unicode_format_arg_parse(ctx, &arg);
15441
51.7M
    if (ret == -1)
15442
0
        return -1;
15443
15444
51.7M
    ret = unicode_format_arg_format(ctx, &arg, &str);
15445
51.7M
    if (ret == -1)
15446
5.16M
        return -1;
15447
15448
46.6M
    if (ret != 1) {
15449
39.1M
        ret = unicode_format_arg_output(ctx, &arg, str);
15450
39.1M
        Py_DECREF(str);
15451
39.1M
        if (ret == -1)
15452
0
            return -1;
15453
39.1M
    }
15454
15455
46.6M
    if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15456
0
        PyErr_SetString(PyExc_TypeError,
15457
0
                        "not all arguments converted during string formatting");
15458
0
        return -1;
15459
0
    }
15460
46.6M
    return 0;
15461
46.6M
}
15462
15463
/* Build the result of the printf-style operation "format % args".

   Literal runs of the format string are copied to a _PyUnicodeWriter;
   each '%' conversion is delegated to unicode_format_arg().
   Returns the finished string, or NULL on error. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (ensure_unicode(format) < 0) {
        return NULL;
    }

    /* Format string cursor: fmtpos walks forward, fmtcnt counts down. */
    ctx.fmtstr = format;
    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
    ctx.fmtpos = 0;

    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    /* Argument source: a tuple is consumed positionally; any other
       non-str mapping is additionally usable for "%(key)s" lookups. */
    ctx.args = args;
    ctx.args_owned = 0;
    if (PyTuple_Check(args)) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
        ctx.dict = NULL;
    }
    else {
        ctx.arglen = -1;
        ctx.argidx = -2;
        if (PyMapping_Check(args) && !PyUnicode_Check(args)) {
            ctx.dict = args;
        }
        else {
            ctx.dict = NULL;
        }
    }

    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) == '%') {
            /* Conversion specifier: step over the '%' and format one arg. */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1) {
                goto onError;
            }
            continue;
        }

        /* Literal text: extend the run up to the next '%' or the end. */
        Py_ssize_t literal_start = ctx.fmtpos++;
        while (ctx.fmtcnt >= 0 &&
               PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            ctx.fmtpos++;
            ctx.fmtcnt--;
        }
        if (ctx.fmtcnt < 0) {
            /* The run reached the end of the format string: undo the
               overshoot and stop overallocating for this final write. */
            ctx.fmtpos--;
            ctx.writer.overallocate = 0;
        }

        if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                            literal_start, ctx.fmtpos) < 0) {
            goto onError;
        }
    }

    if (!ctx.dict && ctx.argidx < ctx.arglen) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return _PyUnicodeWriter_Finish(&ctx.writer);

  onError:
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}
15545
15546
static PyObject *
15547
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15548
15549
/*[clinic input]
15550
@classmethod
15551
str.__new__ as unicode_new
15552
15553
    object as x: object = NULL
15554
    encoding: str = NULL
15555
    errors: str = NULL
15556
15557
[clinic start generated code]*/
15558
15559
static PyObject *
15560
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15561
                 const char *errors)
15562
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15563
10.4M
{
15564
10.4M
    PyObject *unicode;
15565
10.4M
    if (x == NULL) {
15566
0
        unicode = unicode_get_empty();
15567
0
    }
15568
10.4M
    else if (encoding == NULL && errors == NULL) {
15569
10.4M
        unicode = PyObject_Str(x);
15570
10.4M
    }
15571
0
    else {
15572
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15573
0
    }
15574
15575
10.4M
    if (unicode != NULL && type != &PyUnicode_Type) {
15576
10.4M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15577
10.4M
    }
15578
10.4M
    return unicode;
15579
10.4M
}
15580
15581
static const char *
15582
arg_as_utf8(PyObject *obj, const char *name)
15583
1.07M
{
15584
1.07M
    if (!PyUnicode_Check(obj)) {
15585
0
        PyErr_Format(PyExc_TypeError,
15586
0
                     "str() argument '%s' must be str, not %T",
15587
0
                     name, obj);
15588
0
        return NULL;
15589
0
    }
15590
1.07M
    return _PyUnicode_AsUTF8NoNUL(obj);
15591
1.07M
}
15592
15593
static PyObject *
15594
unicode_vectorcall(PyObject *type, PyObject *const *args,
15595
                   size_t nargsf, PyObject *kwnames)
15596
848k
{
15597
848k
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));
15598
15599
848k
    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
15600
848k
    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
15601
        // Fallback to unicode_new()
15602
0
        PyObject *tuple = _PyTuple_FromArray(args, nargs);
15603
0
        if (tuple == NULL) {
15604
0
            return NULL;
15605
0
        }
15606
0
        PyObject *dict = _PyStack_AsDict(args + nargs, kwnames);
15607
0
        if (dict == NULL) {
15608
0
            Py_DECREF(tuple);
15609
0
            return NULL;
15610
0
        }
15611
0
        PyObject *ret = unicode_new(_PyType_CAST(type), tuple, dict);
15612
0
        Py_DECREF(tuple);
15613
0
        Py_DECREF(dict);
15614
0
        return ret;
15615
0
    }
15616
848k
    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
15617
0
        return NULL;
15618
0
    }
15619
848k
    if (nargs == 0) {
15620
0
        return unicode_get_empty();
15621
0
    }
15622
848k
    PyObject *object = args[0];
15623
848k
    if (nargs == 1) {
15624
298
        return PyObject_Str(object);
15625
298
    }
15626
848k
    const char *encoding = arg_as_utf8(args[1], "encoding");
15627
848k
    if (encoding == NULL) {
15628
160
        return NULL;
15629
160
    }
15630
848k
    const char *errors = NULL;
15631
848k
    if (nargs == 3) {
15632
222k
        errors = arg_as_utf8(args[2], "errors");
15633
222k
        if (errors == NULL) {
15634
0
            return NULL;
15635
0
        }
15636
222k
    }
15637
848k
    return PyUnicode_FromEncodedObject(object, encoding, errors);
15638
848k
}
15639
15640
static PyObject *
15641
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15642
10.4M
{
15643
10.4M
    PyObject *self;
15644
10.4M
    Py_ssize_t length, char_size;
15645
10.4M
    int share_utf8;
15646
10.4M
    int kind;
15647
10.4M
    void *data;
15648
15649
10.4M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
15650
10.4M
    assert(_PyUnicode_CHECK(unicode));
15651
15652
10.4M
    self = type->tp_alloc(type, 0);
15653
10.4M
    if (self == NULL) {
15654
0
        return NULL;
15655
0
    }
15656
10.4M
    kind = PyUnicode_KIND(unicode);
15657
10.4M
    length = PyUnicode_GET_LENGTH(unicode);
15658
15659
10.4M
    _PyUnicode_LENGTH(self) = length;
15660
#ifdef Py_DEBUG
15661
    _PyUnicode_HASH(self) = -1;
15662
#else
15663
10.4M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15664
10.4M
#endif
15665
10.4M
    _PyUnicode_STATE(self).interned = 0;
15666
10.4M
    _PyUnicode_STATE(self).kind = kind;
15667
10.4M
    _PyUnicode_STATE(self).compact = 0;
15668
10.4M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15669
10.4M
    _PyUnicode_STATE(self).statically_allocated = 0;
15670
10.4M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
15671
10.4M
    PyUnicode_SET_UTF8(self, NULL);
15672
10.4M
    _PyUnicode_DATA_ANY(self) = NULL;
15673
15674
10.4M
    share_utf8 = 0;
15675
10.4M
    if (kind == PyUnicode_1BYTE_KIND) {
15676
9.22M
        char_size = 1;
15677
9.22M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15678
9.18M
            share_utf8 = 1;
15679
9.22M
    }
15680
1.26M
    else if (kind == PyUnicode_2BYTE_KIND) {
15681
1.21M
        char_size = 2;
15682
1.21M
    }
15683
52.8k
    else {
15684
52.8k
        assert(kind == PyUnicode_4BYTE_KIND);
15685
52.8k
        char_size = 4;
15686
52.8k
    }
15687
15688
    /* Ensure we won't overflow the length. */
15689
10.4M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15690
0
        PyErr_NoMemory();
15691
0
        goto onError;
15692
0
    }
15693
10.4M
    data = PyMem_Malloc((length + 1) * char_size);
15694
10.4M
    if (data == NULL) {
15695
0
        PyErr_NoMemory();
15696
0
        goto onError;
15697
0
    }
15698
15699
10.4M
    _PyUnicode_DATA_ANY(self) = data;
15700
10.4M
    if (share_utf8) {
15701
9.18M
        PyUnicode_SET_UTF8_LENGTH(self, length);
15702
9.18M
        PyUnicode_SET_UTF8(self, data);
15703
9.18M
    }
15704
15705
10.4M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
15706
10.4M
    assert(_PyUnicode_CheckConsistency(self, 1));
15707
#ifdef Py_DEBUG
15708
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15709
#endif
15710
10.4M
    return self;
15711
15712
0
onError:
15713
0
    Py_DECREF(self);
15714
0
    return NULL;
15715
10.4M
}
15716
15717
/* Deallocate `op`, which must be an exact str instance (asserted), by
   calling unicode_dealloc() directly. */
void
15718
_PyUnicode_ExactDealloc(PyObject *op)
15719
59.3M
{
15720
59.3M
    assert(PyUnicode_CheckExact(op));
15721
59.3M
    unicode_dealloc(op);
15722
59.3M
}
15723
15724
PyDoc_STRVAR(unicode_doc,
15725
"str(object='') -> str\n\
15726
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15727
\n\
15728
Create a new string object from the given object. If encoding or\n\
15729
errors is specified, then the object must expose a data buffer\n\
15730
that will be decoded using the given encoding and error handler.\n\
15731
Otherwise, returns the result of object.__str__() (if defined)\n\
15732
or repr(object).\n\
15733
encoding defaults to 'utf-8'.\n\
15734
errors defaults to 'strict'.");
15735
15736
static PyObject *unicode_iter(PyObject *seq);
15737
15738
PyTypeObject PyUnicode_Type = {
15739
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
15740
    "str",                        /* tp_name */
15741
    sizeof(PyUnicodeObject),      /* tp_basicsize */
15742
    0,                            /* tp_itemsize */
15743
    /* Slots */
15744
    unicode_dealloc,              /* tp_dealloc */
15745
    0,                            /* tp_vectorcall_offset */
15746
    0,                            /* tp_getattr */
15747
    0,                            /* tp_setattr */
15748
    0,                            /* tp_as_async */
15749
    unicode_repr,                 /* tp_repr */
15750
    &unicode_as_number,           /* tp_as_number */
15751
    &unicode_as_sequence,         /* tp_as_sequence */
15752
    &unicode_as_mapping,          /* tp_as_mapping */
15753
    unicode_hash,                 /* tp_hash*/
15754
    0,                            /* tp_call*/
15755
    unicode_str,                  /* tp_str */
15756
    PyObject_GenericGetAttr,      /* tp_getattro */
15757
    0,                            /* tp_setattro */
15758
    0,                            /* tp_as_buffer */
15759
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15760
        Py_TPFLAGS_UNICODE_SUBCLASS |
15761
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
15762
    unicode_doc,                  /* tp_doc */
15763
    0,                            /* tp_traverse */
15764
    0,                            /* tp_clear */
15765
    PyUnicode_RichCompare,        /* tp_richcompare */
15766
    0,                            /* tp_weaklistoffset */
15767
    unicode_iter,                 /* tp_iter */
15768
    0,                            /* tp_iternext */
15769
    unicode_methods,              /* tp_methods */
15770
    0,                            /* tp_members */
15771
    0,                            /* tp_getset */
15772
    0,                            /* tp_base */
15773
    0,                            /* tp_dict */
15774
    0,                            /* tp_descr_get */
15775
    0,                            /* tp_descr_set */
15776
    0,                            /* tp_dictoffset */
15777
    0,                            /* tp_init */
15778
    0,                            /* tp_alloc */
15779
    unicode_new,                  /* tp_new */
15780
    PyObject_Free,                /* tp_free */
15781
    .tp_vectorcall = unicode_vectorcall,
15782
};
15783
15784
/* Initialize the Unicode implementation */
15785
15786
static void
15787
_init_global_state(void)
15788
16
{
15789
16
    static int initialized = 0;
15790
16
    if (initialized) {
15791
0
        return;
15792
0
    }
15793
16
    initialized = 1;
15794
15795
    /* initialize the linebreak bloom filter */
15796
16
    const Py_UCS2 linebreak[] = {
15797
16
        0x000A, /* LINE FEED */
15798
16
        0x000D, /* CARRIAGE RETURN */
15799
16
        0x001C, /* FILE SEPARATOR */
15800
16
        0x001D, /* GROUP SEPARATOR */
15801
16
        0x001E, /* RECORD SEPARATOR */
15802
16
        0x0085, /* NEXT LINE */
15803
16
        0x2028, /* LINE SEPARATOR */
15804
16
        0x2029, /* PARAGRAPH SEPARATOR */
15805
16
    };
15806
16
    bloom_linebreak = make_bloom_mask(
15807
16
        PyUnicode_2BYTE_KIND, linebreak,
15808
16
        Py_ARRAY_LENGTH(linebreak));
15809
16
}
15810
15811
void
15812
_PyUnicode_InitState(PyInterpreterState *interp)
15813
16
{
15814
16
    if (!_Py_IsMainInterpreter(interp)) {
15815
0
        return;
15816
0
    }
15817
16
    _init_global_state();
15818
16
}
15819
15820
15821
PyStatus
15822
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
15823
16
{
15824
16
    if (_Py_IsMainInterpreter(interp)) {
15825
16
        PyStatus status = init_global_interned_strings(interp);
15826
16
        if (_PyStatus_EXCEPTION(status)) {
15827
0
            return status;
15828
0
        }
15829
16
    }
15830
16
    assert(INTERNED_STRINGS);
15831
15832
16
    if (init_interned_dict(interp)) {
15833
0
        PyErr_Clear();
15834
0
        return _PyStatus_ERR("failed to create interned dict");
15835
0
    }
15836
15837
16
    return _PyStatus_OK();
15838
16
}
15839
15840
15841
PyStatus
15842
_PyUnicode_InitTypes(PyInterpreterState *interp)
15843
16
{
15844
16
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
15845
0
        goto error;
15846
0
    }
15847
16
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
15848
0
        goto error;
15849
0
    }
15850
16
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
15851
0
        goto error;
15852
0
    }
15853
16
    return _PyStatus_OK();
15854
15855
0
error:
15856
0
    return _PyStatus_ERR("Can't initialize unicode types");
15857
16
}
15858
15859
static /* non-null */ PyObject*
15860
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
15861
16.3k
{
15862
    // Note that this steals a reference to `s`, but in many cases that
15863
    // stolen ref is returned, requiring no decref/incref.
15864
15865
16.3k
    assert(s != NULL);
15866
16.3k
    assert(_PyUnicode_CHECK(s));
15867
16.3k
    assert(_PyUnicode_STATE(s).statically_allocated);
15868
16.3k
    assert(!PyUnicode_CHECK_INTERNED(s));
15869
15870
#ifdef Py_DEBUG
15871
    /* We must not add process-global interned string if there's already a
15872
     * per-interpreter interned_dict, which might contain duplicates.
15873
     */
15874
    PyObject *interned = get_interned_dict(interp);
15875
    assert(interned == NULL);
15876
#endif
15877
15878
    /* Look in the global cache first. */
15879
16.3k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
15880
    /* We should only init each string once */
15881
16.3k
    assert(r == NULL);
15882
    /* but just in case (for the non-debug build), handle this */
15883
16.3k
    if (r != NULL && r != s) {
15884
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
15885
0
        assert(_PyUnicode_CHECK(r));
15886
0
        Py_DECREF(s);
15887
0
        return Py_NewRef(r);
15888
0
    }
15889
15890
16.3k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
15891
0
        Py_FatalError("failed to intern static string");
15892
0
    }
15893
15894
16.3k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
15895
16.3k
    return s;
15896
16.3k
}
15897
15898
void
15899
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
15900
16.3k
{
15901
    // This should only be called as part of runtime initialization
15902
16.3k
    assert(!Py_IsInitialized());
15903
15904
16.3k
    *p = intern_static(interp, *p);
15905
16.3k
    assert(*p);
15906
16.3k
}
15907
15908
static void
15909
immortalize_interned(PyObject *s)
15910
91.9k
{
15911
91.9k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
15912
91.9k
    assert(!_Py_IsImmortal(s));
15913
#ifdef Py_REF_DEBUG
15914
    /* The reference count value should be excluded from the RefTotal.
15915
       The decrements to these objects will not be registered so they
15916
       need to be accounted for in here. */
15917
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
15918
        _Py_DecRefTotal(_PyThreadState_GET());
15919
    }
15920
#endif
15921
91.9k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
15922
91.9k
    _Py_SetImmortal(s);
15923
91.9k
}
15924
15925
static /* non-null */ PyObject*
15926
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
15927
              bool immortalize)
15928
35.8M
{
15929
    // Note that this steals a reference to `s`, but in many cases that
15930
    // stolen ref is returned, requiring no decref/incref.
15931
15932
#ifdef Py_DEBUG
15933
    assert(s != NULL);
15934
    assert(_PyUnicode_CHECK(s));
15935
#else
15936
35.8M
    if (s == NULL || !PyUnicode_Check(s)) {
15937
0
        return s;
15938
0
    }
15939
35.8M
#endif
15940
15941
    /* If it's a subclass, we don't really know what putting
15942
       it in the interned dict might do. */
15943
35.8M
    if (!PyUnicode_CheckExact(s)) {
15944
0
        return s;
15945
0
    }
15946
15947
    /* Is it already interned? */
15948
35.8M
    switch (PyUnicode_CHECK_INTERNED(s)) {
15949
3.26M
        case SSTATE_NOT_INTERNED:
15950
            // no, go on
15951
3.26M
            break;
15952
12.9k
        case SSTATE_INTERNED_MORTAL:
15953
            // yes but we might need to make it immortal
15954
12.9k
            if (immortalize) {
15955
34
                immortalize_interned(s);
15956
34
            }
15957
12.9k
            return s;
15958
32.5M
        default:
15959
            // all done
15960
32.5M
            return s;
15961
35.8M
    }
15962
15963
    /* Statically allocated strings must be already interned. */
15964
3.26M
    assert(!_PyUnicode_STATE(s).statically_allocated);
15965
15966
#if Py_GIL_DISABLED
15967
    /* In the free-threaded build, all interned strings are immortal */
15968
    immortalize = 1;
15969
#endif
15970
15971
    /* If it's already immortal, intern it as such */
15972
3.26M
    if (_Py_IsImmortal(s)) {
15973
0
        immortalize = 1;
15974
0
    }
15975
15976
    /* if it's a short string, get the singleton */
15977
3.26M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
15978
3.26M
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
15979
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
15980
0
        assert(PyUnicode_CHECK_INTERNED(r));
15981
0
        Py_DECREF(s);
15982
0
        return r;
15983
0
    }
15984
#ifdef Py_DEBUG
15985
    assert(!unicode_is_singleton(s));
15986
#endif
15987
15988
    /* Look in the global cache now. */
15989
3.26M
    {
15990
3.26M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
15991
3.26M
        if (r != NULL) {
15992
241k
            assert(_PyUnicode_STATE(r).statically_allocated);
15993
241k
            assert(r != s);  // r must be statically_allocated; s is not
15994
241k
            Py_DECREF(s);
15995
241k
            return Py_NewRef(r);
15996
241k
        }
15997
3.26M
    }
15998
15999
    /* Do a setdefault on the per-interpreter cache. */
16000
3.02M
    PyObject *interned = get_interned_dict(interp);
16001
3.02M
    assert(interned != NULL);
16002
16003
3.02M
    LOCK_INTERNED(interp);
16004
3.02M
    PyObject *t;
16005
3.02M
    {
16006
3.02M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
16007
3.02M
        if (res < 0) {
16008
0
            PyErr_Clear();
16009
0
            UNLOCK_INTERNED(interp);
16010
0
            return s;
16011
0
        }
16012
3.02M
        else if (res == 1) {
16013
            // value was already present (not inserted)
16014
2.31M
            Py_DECREF(s);
16015
2.31M
            if (immortalize &&
16016
2.31M
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
16017
4.09k
                immortalize_interned(t);
16018
4.09k
            }
16019
2.31M
            UNLOCK_INTERNED(interp);
16020
2.31M
            return t;
16021
2.31M
        }
16022
700k
        else {
16023
            // value was newly inserted
16024
700k
            assert (s == t);
16025
700k
            Py_DECREF(t);
16026
700k
        }
16027
3.02M
    }
16028
16029
    /* NOT_INTERNED -> INTERNED_MORTAL */
16030
16031
700k
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
16032
16033
700k
    if (!_Py_IsImmortal(s)) {
16034
        /* The two references in interned dict (key and value) are not counted.
16035
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
16036
700k
        Py_DECREF(s);
16037
700k
        Py_DECREF(s);
16038
700k
    }
16039
700k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
16040
16041
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
16042
16043
#ifdef Py_DEBUG
16044
    if (_Py_IsImmortal(s)) {
16045
        assert(immortalize);
16046
    }
16047
#endif
16048
700k
    if (immortalize) {
16049
87.8k
        immortalize_interned(s);
16050
87.8k
    }
16051
16052
700k
    UNLOCK_INTERNED(interp);
16053
700k
    return s;
16054
3.02M
}
16055
16056
void
16057
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
16058
2.63M
{
16059
2.63M
    *p = intern_common(interp, *p, 1);
16060
2.63M
    assert(*p);
16061
2.63M
}
16062
16063
void
16064
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
16065
33.2M
{
16066
33.2M
    *p = intern_common(interp, *p, 0);
16067
33.2M
    assert(*p);
16068
33.2M
}
16069
16070
16071
void
16072
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
16073
0
{
16074
0
    _PyUnicode_InternImmortal(interp, p);
16075
0
    return;
16076
0
}
16077
16078
void
16079
PyUnicode_InternInPlace(PyObject **p)
16080
0
{
16081
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
16082
0
    _PyUnicode_InternMortal(interp, p);
16083
0
}
16084
16085
// Public-looking name kept for the stable ABI; user should not call this:
16086
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
16087
void
16088
PyUnicode_InternImmortal(PyObject **p)
16089
0
{
16090
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
16091
0
    _PyUnicode_InternImmortal(interp, p);
16092
0
}
16093
16094
PyObject *
16095
PyUnicode_InternFromString(const char *cp)
16096
1.14M
{
16097
1.14M
    PyObject *s = PyUnicode_FromString(cp);
16098
1.14M
    if (s == NULL) {
16099
0
        return NULL;
16100
0
    }
16101
1.14M
    PyInterpreterState *interp = _PyInterpreterState_GET();
16102
1.14M
    _PyUnicode_InternMortal(interp, &s);
16103
1.14M
    return s;
16104
1.14M
}
16105
16106
16107
void
16108
_PyUnicode_ClearInterned(PyInterpreterState *interp)
16109
0
{
16110
0
    PyObject *interned = get_interned_dict(interp);
16111
0
    if (interned == NULL) {
16112
0
        return;
16113
0
    }
16114
0
    assert(PyDict_CheckExact(interned));
16115
16116
0
    if (has_shared_intern_dict(interp)) {
16117
        // the dict doesn't belong to this interpreter, skip the debug
16118
        // checks on it and just clear the pointer to it
16119
0
        clear_interned_dict(interp);
16120
0
        return;
16121
0
    }
16122
16123
#ifdef INTERNED_STATS
16124
    fprintf(stderr, "releasing %zd interned strings\n",
16125
            PyDict_GET_SIZE(interned));
16126
16127
    Py_ssize_t total_length = 0;
16128
#endif
16129
0
    Py_ssize_t pos = 0;
16130
0
    PyObject *s, *ignored_value;
16131
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
16132
0
        int shared = 0;
16133
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
16134
0
        case SSTATE_INTERNED_IMMORTAL:
16135
            /* Make immortal interned strings mortal again. */
16136
            // Skip the Immortal Instance check and restore
16137
            // the two references (key and value) ignored
16138
            // by PyUnicode_InternInPlace().
16139
0
            _Py_SetMortal(s, 2);
16140
#ifdef Py_REF_DEBUG
16141
            /* let's be pedantic with the ref total */
16142
            _Py_IncRefTotal(_PyThreadState_GET());
16143
            _Py_IncRefTotal(_PyThreadState_GET());
16144
#endif
16145
#ifdef INTERNED_STATS
16146
            total_length += PyUnicode_GET_LENGTH(s);
16147
#endif
16148
0
            break;
16149
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
16150
            /* It is shared between interpreters, so we should unmark it
16151
               only when this is the last interpreter in which it's
16152
               interned.  We immortalize all the statically initialized
16153
               strings during startup, so we can rely on the
16154
               main interpreter to be the last one. */
16155
0
            if (!_Py_IsMainInterpreter(interp)) {
16156
0
                shared = 1;
16157
0
            }
16158
0
            break;
16159
0
        case SSTATE_INTERNED_MORTAL:
16160
            // Restore 2 references held by the interned dict; these will
16161
            // be decref'd by clear_interned_dict's PyDict_Clear.
16162
0
            _Py_RefcntAdd(s, 2);
16163
#ifdef Py_REF_DEBUG
16164
            /* let's be pedantic with the ref total */
16165
            _Py_IncRefTotal(_PyThreadState_GET());
16166
            _Py_IncRefTotal(_PyThreadState_GET());
16167
#endif
16168
0
            break;
16169
0
        case SSTATE_NOT_INTERNED:
16170
0
            _Py_FALLTHROUGH;
16171
0
        default:
16172
0
            Py_UNREACHABLE();
16173
0
        }
16174
0
        if (!shared) {
16175
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
16176
0
        }
16177
0
    }
16178
#ifdef INTERNED_STATS
16179
    fprintf(stderr,
16180
            "total length of all interned strings: %zd characters\n",
16181
            total_length);
16182
#endif
16183
16184
0
    struct _Py_unicode_state *state = &interp->unicode;
16185
0
    struct _Py_unicode_ids *ids = &state->ids;
16186
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
16187
0
        Py_XINCREF(ids->array[i]);
16188
0
    }
16189
0
    clear_interned_dict(interp);
16190
0
    if (_Py_IsMainInterpreter(interp)) {
16191
0
        clear_global_interned_strings();
16192
0
    }
16193
0
}
16194
16195
16196
/********************* Unicode Iterator **************************/
16197
16198
/* State of a str iterator: the string being iterated and the position of
   the next character to produce. */
typedef struct {
16199
    PyObject_HEAD
16200
    Py_ssize_t it_index; /* Index of the next character to return */
16201
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
16202
} unicodeiterobject;
16203
16204
static void
16205
unicodeiter_dealloc(PyObject *op)
16206
1.67M
{
16207
1.67M
    unicodeiterobject *it = (unicodeiterobject *)op;
16208
1.67M
    _PyObject_GC_UNTRACK(it);
16209
1.67M
    Py_XDECREF(it->it_seq);
16210
1.67M
    PyObject_GC_Del(it);
16211
1.67M
}
16212
16213
static int
16214
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
16215
14
{
16216
14
    unicodeiterobject *it = (unicodeiterobject *)op;
16217
14
    Py_VISIT(it->it_seq);
16218
14
    return 0;
16219
14
}
16220
16221
/* Return the next character of the iterated string as a str, or NULL once
   the iterator is exhausted.  On exhaustion the reference to the string is
   dropped and it_seq is set to NULL. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Past the end: detach from the string before releasing it. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    int kind = PyUnicode_KIND(seq);
    const void *data = PyUnicode_DATA(seq);
    Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(chr);
}
16245
16246
/* Variant of unicodeiter_next() for compact ASCII strings (asserted): the
   characters are read from the memory right after the PyASCIIObject header
   and the result is taken from the table of ASCII singleton strings.
   Returns NULL once the iterator is exhausted. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Past the end: detach from the string before releasing it. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                          data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
}
16268
16269
/* __length_hint__: number of characters left to iterate, as a Python int
   (0 once the iterator is exhausted). */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    Py_ssize_t remaining;

    if (it->it_seq) {
        remaining = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
    }
    else {
        remaining = 0;
    }
    return PyLong_FromSsize_t(remaining);
}
16278
16279
/* Docstring attached to "__length_hint__" in unicodeiter_methods. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
16280
16281
/* __reduce__ implementation for str iterators.
 *
 * Live iterator:      returns (iter, (string,), index)
 * Exhausted iterator: returns (iter, (empty_string,))
 *
 * Returns NULL if the result tuple cannot be built.
 */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *self = (unicodeiterobject *)op;

    /* _PyEval_GetBuiltin can invoke arbitrary code, so this call must
     * happen before any of the iterator's fields are read.
     * See issue #101765. */
    PyObject *iter_func = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (self->it_seq == NULL) {
        /* Exhausted: pickle as an iterator over the empty string. */
        PyObject *empty = unicode_get_empty();
        if (empty == NULL) {
            Py_XDECREF(iter_func);
            return NULL;
        }
        return Py_BuildValue("N(N)", iter_func, empty);
    }

    /* "N" hands the reference to iter_func over to the result tuple. */
    return Py_BuildValue("N(O)n", iter_func, self->it_seq, self->it_index);
}
16302
16303
/* Docstring attached to the iterator's "__reduce__" method entry. */
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
16304
16305
/* __setstate__ implementation: restore the iterator position.
 *
 * `state` is converted with PyLong_AsSsize_t(); on conversion failure
 * NULL is returned with the exception left set.  The position is clamped
 * to the range [0, len(string)].  An exhausted iterator is left
 * untouched.  Returns None on success.
 */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *iter = (unicodeiterobject *)op;

    Py_ssize_t pos = PyLong_AsSsize_t(state);
    if (pos == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (iter->it_seq == NULL) {
        Py_RETURN_NONE;
    }

    Py_ssize_t length = PyUnicode_GET_LENGTH(iter->it_seq);
    if (pos < 0) {
        pos = 0;
    }
    else if (pos > length) {
        pos = length; /* iterator truncated */
    }
    iter->it_index = pos;

    Py_RETURN_NONE;
}
16321
16322
/* Docstring attached to the iterator's "__setstate__" method entry. */
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16323
16324
static PyMethodDef unicodeiter_methods[] = {
16325
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
16326
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
16327
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
16328
    {NULL,      NULL}       /* sentinel */
16329
};
16330
16331
PyTypeObject PyUnicodeIter_Type = {
16332
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
16333
    "str_iterator",         /* tp_name */
16334
    sizeof(unicodeiterobject),      /* tp_basicsize */
16335
    0,                  /* tp_itemsize */
16336
    /* methods */
16337
    unicodeiter_dealloc,/* tp_dealloc */
16338
    0,                  /* tp_vectorcall_offset */
16339
    0,                  /* tp_getattr */
16340
    0,                  /* tp_setattr */
16341
    0,                  /* tp_as_async */
16342
    0,                  /* tp_repr */
16343
    0,                  /* tp_as_number */
16344
    0,                  /* tp_as_sequence */
16345
    0,                  /* tp_as_mapping */
16346
    0,                  /* tp_hash */
16347
    0,                  /* tp_call */
16348
    0,                  /* tp_str */
16349
    PyObject_GenericGetAttr,        /* tp_getattro */
16350
    0,                  /* tp_setattro */
16351
    0,                  /* tp_as_buffer */
16352
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
16353
    0,                  /* tp_doc */
16354
    unicodeiter_traverse, /* tp_traverse */
16355
    0,                  /* tp_clear */
16356
    0,                  /* tp_richcompare */
16357
    0,                  /* tp_weaklistoffset */
16358
    PyObject_SelfIter,          /* tp_iter */
16359
    unicodeiter_next,   /* tp_iternext */
16360
    unicodeiter_methods,            /* tp_methods */
16361
    0,
16362
};
16363
16364
PyTypeObject _PyUnicodeASCIIIter_Type = {
16365
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
16366
    .tp_name = "str_ascii_iterator",
16367
    .tp_basicsize = sizeof(unicodeiterobject),
16368
    .tp_dealloc = unicodeiter_dealloc,
16369
    .tp_getattro = PyObject_GenericGetAttr,
16370
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
16371
    .tp_traverse = unicodeiter_traverse,
16372
    .tp_iter = PyObject_SelfIter,
16373
    .tp_iternext = unicode_ascii_iter_next,
16374
    .tp_methods = unicodeiter_methods,
16375
};
16376
16377
/* Create a new iterator over the str object `seq`.
 *
 * Compact-ASCII strings get an iterator of type _PyUnicodeASCIIIter_Type;
 * every other string gets a PyUnicodeIter_Type iterator.  The iterator
 * holds a strong reference to `seq` and starts at index 0.
 *
 * Returns the new iterator, or NULL if `seq` is not a str (bad internal
 * call) or the iterator object could not be allocated.
 */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
                                  ? &_PyUnicodeASCIIIter_Type
                                  : &PyUnicodeIter_Type;

    unicodeiterobject *iter = PyObject_GC_New(unicodeiterobject, iter_type);
    if (iter == NULL) {
        return NULL;
    }

    iter->it_index = 0;
    iter->it_seq = Py_NewRef(seq);
    /* Only start GC tracking once every field is initialized. */
    _PyObject_GC_TRACK(iter);
    return (PyObject *)iter;
}
16399
16400
static int
16401
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
16402
64
{
16403
64
    int res;
16404
64
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16405
64
    if (res == -2) {
16406
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
16407
0
        return -1;
16408
0
    }
16409
64
    if (res < 0) {
16410
0
        PyErr_NoMemory();
16411
0
        return -1;
16412
0
    }
16413
64
    return 0;
16414
64
}
16415
16416
16417
static int
16418
config_get_codec_name(wchar_t **config_encoding)
16419
32
{
16420
32
    char *encoding;
16421
32
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16422
0
        return -1;
16423
0
    }
16424
16425
32
    PyObject *name_obj = NULL;
16426
32
    PyObject *codec = _PyCodec_Lookup(encoding);
16427
32
    PyMem_RawFree(encoding);
16428
16429
32
    if (!codec)
16430
0
        goto error;
16431
16432
32
    name_obj = PyObject_GetAttrString(codec, "name");
16433
32
    Py_CLEAR(codec);
16434
32
    if (!name_obj) {
16435
0
        goto error;
16436
0
    }
16437
16438
32
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16439
32
    Py_DECREF(name_obj);
16440
32
    if (wname == NULL) {
16441
0
        goto error;
16442
0
    }
16443
16444
32
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16445
32
    if (raw_wname == NULL) {
16446
0
        PyMem_Free(wname);
16447
0
        PyErr_NoMemory();
16448
0
        goto error;
16449
0
    }
16450
16451
32
    PyMem_RawFree(*config_encoding);
16452
32
    *config_encoding = raw_wname;
16453
16454
32
    PyMem_Free(wname);
16455
32
    return 0;
16456
16457
0
error:
16458
0
    Py_XDECREF(codec);
16459
0
    Py_XDECREF(name_obj);
16460
0
    return -1;
16461
32
}
16462
16463
16464
static PyStatus
16465
init_stdio_encoding(PyInterpreterState *interp)
16466
16
{
16467
    /* Update the stdio encoding to the normalized Python codec name. */
16468
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16469
16
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
16470
0
        return _PyStatus_ERR("failed to get the Python codec name "
16471
0
                             "of the stdio encoding");
16472
0
    }
16473
16
    return _PyStatus_OK();
16474
16
}
16475
16476
16477
static int
16478
init_fs_codec(PyInterpreterState *interp)
16479
16
{
16480
16
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
16481
16482
16
    _Py_error_handler error_handler;
16483
16
    error_handler = get_error_handler_wide(config->filesystem_errors);
16484
16
    if (error_handler == _Py_ERROR_UNKNOWN) {
16485
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
16486
0
        return -1;
16487
0
    }
16488
16489
16
    char *encoding, *errors;
16490
16
    if (encode_wstr_utf8(config->filesystem_encoding,
16491
16
                         &encoding,
16492
16
                         "filesystem_encoding") < 0) {
16493
0
        return -1;
16494
0
    }
16495
16496
16
    if (encode_wstr_utf8(config->filesystem_errors,
16497
16
                         &errors,
16498
16
                         "filesystem_errors") < 0) {
16499
0
        PyMem_RawFree(encoding);
16500
0
        return -1;
16501
0
    }
16502
16503
16
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16504
16
    PyMem_RawFree(fs_codec->encoding);
16505
16
    fs_codec->encoding = encoding;
16506
    /* encoding has been normalized by init_fs_encoding() */
16507
16
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16508
16
    PyMem_RawFree(fs_codec->errors);
16509
16
    fs_codec->errors = errors;
16510
16
    fs_codec->error_handler = error_handler;
16511
16512
#ifdef _Py_FORCE_UTF8_FS_ENCODING
16513
    assert(fs_codec->utf8 == 1);
16514
#endif
16515
16516
    /* At this point, PyUnicode_EncodeFSDefault() and
16517
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16518
       the C implementation of the filesystem encoding. */
16519
16520
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16521
       global configuration variables. */
16522
16
    if (_Py_IsMainInterpreter(interp)) {
16523
16524
16
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16525
16
                                      fs_codec->errors) < 0) {
16526
0
            PyErr_NoMemory();
16527
0
            return -1;
16528
0
        }
16529
16
    }
16530
16
    return 0;
16531
16
}
16532
16533
16534
static PyStatus
16535
init_fs_encoding(PyThreadState *tstate)
16536
16
{
16537
16
    PyInterpreterState *interp = tstate->interp;
16538
16539
    /* Update the filesystem encoding to the normalized Python codec name.
16540
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16541
       (Python codec name). */
16542
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16543
16
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16544
0
        _Py_DumpPathConfig(tstate);
16545
0
        return _PyStatus_ERR("failed to get the Python codec "
16546
0
                             "of the filesystem encoding");
16547
0
    }
16548
16549
16
    if (init_fs_codec(interp) < 0) {
16550
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
16551
0
    }
16552
16
    return _PyStatus_OK();
16553
16
}
16554
16555
16556
PyStatus
16557
_PyUnicode_InitEncodings(PyThreadState *tstate)
16558
16
{
16559
16
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
16560
16
    if (_PyStatus_EXCEPTION(status)) {
16561
0
        return status;
16562
0
    }
16563
16
    status = init_fs_encoding(tstate);
16564
16
    if (_PyStatus_EXCEPTION(status)) {
16565
0
        return status;
16566
0
    }
16567
16568
16
    return init_stdio_encoding(tstate->interp);
16569
16
}
16570
16571
16572
static void
16573
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16574
0
{
16575
0
    PyMem_RawFree(fs_codec->encoding);
16576
0
    fs_codec->encoding = NULL;
16577
0
    fs_codec->utf8 = 0;
16578
0
    PyMem_RawFree(fs_codec->errors);
16579
0
    fs_codec->errors = NULL;
16580
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16581
0
}
16582
16583
16584
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to mbcs/replace
 * (PEP 529) and re-initialize the filesystem codec state.
 *
 * Returns the result of init_fs_codec() on success, or -1 with
 * MemoryError set if the new strings cannot be allocated (in which case
 * the configuration is left unchanged).
 */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    wchar_t *mbcs = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *replace = _PyMem_RawWcsdup(L"replace");

    if (mbcs != NULL && replace != NULL) {
        /* Both copies exist: install them in place of the old values. */
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = mbcs;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = replace;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed; freeing NULL is harmless. */
    PyMem_RawFree(mbcs);
    PyMem_RawFree(replace);
    PyErr_NoMemory();
    return -1;
}
#endif
16609
16610
16611
#ifdef Py_DEBUG
/* Debug-only helper: returns non-zero when the main interpreter no
 * longer has an interned-strings dict. */
static inline int
unicode_is_finalizing(void)
{
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
    if (get_interned_dict(main_interp) == NULL) {
        return 1;
    }
    return 0;
}
#endif
16618
16619
16620
void
16621
_PyUnicode_FiniTypes(PyInterpreterState *interp)
16622
0
{
16623
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
16624
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
16625
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
16626
0
}
16627
16628
16629
void
16630
_PyUnicode_Fini(PyInterpreterState *interp)
16631
0
{
16632
0
    struct _Py_unicode_state *state = &interp->unicode;
16633
16634
0
    if (!has_shared_intern_dict(interp)) {
16635
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16636
0
        assert(get_interned_dict(interp) == NULL);
16637
0
    }
16638
16639
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
16640
16641
    // bpo-47182: force a unicodedata CAPI capsule re-import on
16642
    // subsequent initialization of interpreter.
16643
0
    interp->unicode.ucnhash_capi = NULL;
16644
16645
0
    unicode_clear_identifiers(state);
16646
0
}
16647
16648
/* A _string module, to export formatter_parser and formatter_field_name_split
16649
   to the string.Formatter class implemented in Python. */
16650
16651
static PyMethodDef _string_methods[] = {
16652
    {"formatter_field_name_split", formatter_field_name_split,
16653
     METH_O, PyDoc_STR("split the argument as a field name")},
16654
    {"formatter_parser", formatter_parser,
16655
     METH_O, PyDoc_STR("parse the argument as a format string")},
16656
    {NULL, NULL}
16657
};
16658
16659
static PyModuleDef_Slot module_slots[] = {
16660
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
16661
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
16662
    {0, NULL}
16663
};
16664
16665
static struct PyModuleDef _string_module = {
16666
    PyModuleDef_HEAD_INIT,
16667
    .m_name = "_string",
16668
    .m_doc = PyDoc_STR("string helper module"),
16669
    .m_size = 0,
16670
    .m_methods = _string_methods,
16671
    .m_slots = module_slots,
16672
};
16673
16674
PyMODINIT_FUNC
16675
PyInit__string(void)
16676
6
{
16677
6
    return PyModuleDef_Init(&_string_module);
16678
6
}
16679
16680
16681
#undef PyUnicode_KIND
16682
int PyUnicode_KIND(PyObject *op)
16683
0
{
16684
0
    if (!PyUnicode_Check(op)) {
16685
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
16686
0
        return -1;
16687
0
    }
16688
0
    return _PyASCIIObject_CAST(op)->state.kind;
16689
0
}
16690
16691
#undef PyUnicode_DATA
16692
void* PyUnicode_DATA(PyObject *op)
16693
0
{
16694
0
    if (!PyUnicode_Check(op)) {
16695
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
16696
0
        return NULL;
16697
0
    }
16698
0
    return _PyUnicode_DATA(op);
16699
0
}