Coverage Report

Created: 2026-06-21 06:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_RepeatBuffer()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_initconfig.h"    // _PyStatus_OK()
50
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
51
#include "pycore_long.h"          // _PyLong_FormatWriter()
52
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55
#include "pycore_pyhash.h"        // _Py_HashSecret_t
56
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
57
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
58
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
59
#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def c_default_init(self):
90
        import libclinic
91
        self.c_default = libclinic.c_unichar_repr(self.default)
92
93
[python start generated code]*/
94
/*[python end generated code: output=da39a3ee5e6b4b0d input=22f057b68fd9a65a]*/
95
96
/* --- Globals ------------------------------------------------------------
97
98
NOTE: In the interpreter's initialization phase, some globals are currently
99
      initialized dynamically as needed. In the process Unicode objects may
100
      be created before the Unicode type is ready.
101
102
*/
103
104
21.3M
#define MAX_UNICODE _Py_MAX_UNICODE
105
270M
#define ensure_unicode _PyUnicode_EnsureUnicode
106
107
#ifdef Py_DEBUG
108
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
109
#else
110
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
111
#endif
112
113
/* Read the `utf8` pointer stored in the compact header of `op`.
   The read goes through FT_ATOMIC_LOAD_PTR_ACQUIRE. */
static inline char *
_PyUnicode_UTF8(PyObject *op)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(compact->utf8);
}
117
118
/* Return the UTF-8 buffer of `op`.
   For a compact ASCII string this is the memory immediately following the
   PyASCIIObject header; otherwise it is whatever _PyUnicode_UTF8() loads
   from the object. */
static inline char *
PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyUnicode_UTF8(op);
    }
    return (char *)(_PyASCIIObject_CAST(op) + 1);
}
128
129
/* Store `utf8` into the `utf8` field of the compact header of `op`.
   The write goes through FT_ATOMIC_STORE_PTR_RELEASE. */
static inline void
PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    FT_ATOMIC_STORE_PTR_RELEASE(compact->utf8, utf8);
}
133
134
/* Return the UTF-8 length recorded for `op`: the `length` field for a
   compact ASCII string, the `utf8_length` field otherwise. */
static inline Py_ssize_t
PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyCompactUnicodeObject_CAST(op)->utf8_length;
    }
    return _PyASCIIObject_CAST(op)->length;
}
144
145
/* Overwrite the `utf8_length` field of the compact header of `op`. */
static inline void
PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    compact->utf8_length = length;
}
149
150
#define _PyUnicode_LENGTH(op)                           \
151
602M
    (_PyASCIIObject_CAST(op)->length)
152
#define _PyUnicode_STATE(op)                            \
153
3.80G
    (_PyASCIIObject_CAST(op)->state)
154
#define _PyUnicode_HASH(op)                             \
155
552M
    (_PyASCIIObject_CAST(op)->hash)
156
157
282M
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
158
159
/* Store `hash` into the `hash` field of `op`.
   The write goes through FT_ATOMIC_STORE_SSIZE_RELAXED. */
static inline void
PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    PyASCIIObject *header = _PyASCIIObject_CAST(op);
    FT_ATOMIC_STORE_SSIZE_RELAXED(header->hash, hash);
}
163
164
#define _PyUnicode_DATA_ANY(op)                         \
165
39.2M
    (_PyUnicodeObject_CAST(op)->data.any)
166
167
/* Return nonzero when the UTF-8 pointer of `op` is the very same address
   as its character data.  Must not be called on compact ASCII strings
   (asserted). */
static inline int
_PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));

    const void *utf8 = _PyUnicode_UTF8(op);
    const void *chars = PyUnicode_DATA(op);
    return utf8 == chars;
}
173
174
/* true if the Unicode object has an allocated UTF-8 memory block
175
   (not shared with other data) */
176
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
177
601M
{
178
601M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
179
224M
            && _PyUnicode_UTF8(op) != NULL
180
8.81M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
181
601M
}
182
183
184
234M
#define LATIN1 _Py_LATIN1_CHR
185
186
/* Forward declaration */
187
static PyObject *
188
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
189
                    const char *errors);
190
static PyObject *
191
unicode_decode_utf8(const char *s, Py_ssize_t size,
192
                    _Py_error_handler error_handler, const char *errors,
193
                    Py_ssize_t *consumed);
194
#ifdef Py_DEBUG
195
static inline int unicode_is_finalizing(void);
196
static int unicode_is_singleton(PyObject *unicode);
197
#endif
198
199
200
// Return a reference to the immortal empty string singleton.
201
PyObject*
202
_PyUnicode_GetEmpty(void)
203
94.9M
{
204
94.9M
    _Py_DECLARE_STR(empty, "");
205
94.9M
    return &_Py_STR(empty);
206
94.9M
}
207
208
/* This dictionary holds per-interpreter interned strings.
209
 * See InternalDocs/string_interning.md for details.
210
 */
211
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
212
5.62M
{
213
5.62M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
214
5.62M
}
215
216
/* This hashtable holds statically allocated interned strings.
217
 * See InternalDocs/string_interning.md for details.
218
 */
219
6.01M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
220
221
/* Get number of all interned strings for the current interpreter. */
222
Py_ssize_t
223
_PyUnicode_InternedSize(void)
224
0
{
225
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
226
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
227
0
}
228
229
/* Get number of immortal interned strings for the current interpreter. */
230
Py_ssize_t
231
_PyUnicode_InternedSize_Immortal(void)
232
0
{
233
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
234
0
    PyObject *key, *value;
235
0
    Py_ssize_t pos = 0;
236
0
    Py_ssize_t count = 0;
237
238
    // It's tempting to keep a count and avoid a loop here. But, this function
239
    // is intended for refleak tests. It spends extra work to report the true
240
    // value, to help detect bugs in optimizations.
241
242
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
243
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
244
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
245
0
           count++;
246
0
       }
247
0
    }
248
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
249
0
}
250
251
static Py_hash_t unicode_hash(PyObject *);
252
253
static Py_uhash_t
254
hashtable_unicode_hash(const void *key)
255
6.01M
{
256
6.01M
    return unicode_hash((PyObject *)key);
257
6.01M
}
258
259
static int
260
hashtable_unicode_compare(const void *key1, const void *key2)
261
514k
{
262
514k
    PyObject *obj1 = (PyObject *)key1;
263
514k
    PyObject *obj2 = (PyObject *)key2;
264
514k
    if (obj1 != NULL && obj2 != NULL) {
265
514k
        return unicode_eq(obj1, obj2);
266
514k
    }
267
0
    else {
268
0
        return obj1 == obj2;
269
0
    }
270
514k
}
271
272
/* Return true if this interpreter should share the main interpreter's
273
   intern_dict.  That's important for interpreters which load basic
274
   single-phase init extension modules (m_size == -1).  There could be interned
275
   immortal strings that are shared between interpreters, due to the
276
   PyDict_Update(mdict, m_copy) call in import_find_extension().
277
278
   It's not safe to deallocate those strings until all interpreters that
279
   potentially use them are freed.  By storing them in the main interpreter, we
280
   ensure they get freed after all other interpreters are freed.
281
*/
282
static bool
283
has_shared_intern_dict(PyInterpreterState *interp)
284
36
{
285
36
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
286
36
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
287
36
}
288
289
static int
290
init_interned_dict(PyInterpreterState *interp)
291
36
{
292
36
    assert(get_interned_dict(interp) == NULL);
293
36
    PyObject *interned;
294
36
    if (has_shared_intern_dict(interp)) {
295
0
        interned = get_interned_dict(_PyInterpreterState_Main());
296
0
        Py_INCREF(interned);
297
0
    }
298
36
    else {
299
36
        interned = PyDict_New();
300
36
        if (interned == NULL) {
301
0
            return -1;
302
0
        }
303
36
    }
304
36
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
305
36
    return 0;
306
36
}
307
308
static void
309
clear_interned_dict(PyInterpreterState *interp)
310
0
{
311
0
    PyObject *interned = get_interned_dict(interp);
312
0
    if (interned != NULL) {
313
0
        if (!has_shared_intern_dict(interp)) {
314
            // only clear if the dict belongs to this interpreter
315
0
            PyDict_Clear(interned);
316
0
        }
317
0
        Py_DECREF(interned);
318
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
319
0
    }
320
0
}
321
322
static PyStatus
323
init_global_interned_strings(PyInterpreterState *interp)
324
36
{
325
36
    assert(INTERNED_STRINGS == NULL);
326
36
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
327
328
36
    INTERNED_STRINGS = _Py_hashtable_new_full(
329
36
        hashtable_unicode_hash,
330
36
        hashtable_unicode_compare,
331
        // Objects stored here are immortal and statically allocated,
332
        // so we don't need key_destroy_func & value_destroy_func:
333
36
        NULL,
334
36
        NULL,
335
36
        &hashtable_alloc
336
36
    );
337
36
    if (INTERNED_STRINGS == NULL) {
338
0
        PyErr_Clear();
339
0
        return _PyStatus_ERR("failed to create global interned dict");
340
0
    }
341
342
    /* Intern statically allocated string identifiers, deepfreeze strings,
343
        * and one-byte latin-1 strings.
344
        * This must be done before any module initialization so that statically
345
        * allocated string identifiers are used instead of heap allocated strings.
346
        * Deepfreeze uses the interned identifiers if present to save space
347
        * else generates them and they are interned to speed up dict lookups.
348
    */
349
36
    _PyUnicode_InitStaticStrings(interp);
350
351
9.25k
    for (int i = 0; i < 256; i++) {
352
9.21k
        PyObject *s = LATIN1(i);
353
9.21k
        _PyUnicode_InternStatic(interp, &s);
354
9.21k
        assert(s == LATIN1(i));
355
9.21k
    }
356
#ifdef Py_DEBUG
357
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
358
359
    for (int i = 0; i < 256; i++) {
360
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
361
    }
362
#endif
363
36
    return _PyStatus_OK();
364
36
}
365
366
static void clear_global_interned_strings(void)
367
0
{
368
0
    if (INTERNED_STRINGS != NULL) {
369
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
370
0
        INTERNED_STRINGS = NULL;
371
0
    }
372
0
}
373
374
#define _Py_RETURN_UNICODE_EMPTY()   \
375
46.3M
    do {                             \
376
46.3M
        return _PyUnicode_GetEmpty();\
377
46.3M
    } while (0)
378
379
380
/* Fast detection of the most frequent whitespace characters.
   128-entry lookup table indexed by an ASCII code; an entry is 1 for a
   whitespace character and 0 (the implicit default) for everything else. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
410
411
/* forward */
412
static PyObject* get_latin1_char(unsigned char ch);
413
414
415
static PyObject *
416
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
417
static PyObject *
418
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
419
static PyObject *
420
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
421
422
static PyObject *
423
unicode_encode_call_errorhandler(const char *errors,
424
       PyObject **errorHandler,const char *encoding, const char *reason,
425
       PyObject *unicode, PyObject **exceptionObject,
426
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
427
428
static void
429
raise_encode_exception(PyObject **exceptionObject,
430
                       const char *encoding,
431
                       PyObject *unicode,
432
                       Py_ssize_t startpos, Py_ssize_t endpos,
433
                       const char *reason);
434
435
/* Same for linebreaks: 128-entry lookup table indexed by an ASCII code;
   an entry is 1 for a line-break character and 0 (the implicit default)
   for everything else. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
462
463
static int convert_uc(PyObject *obj, void *addr);
464
465
struct encoding_map;
466
#include "clinic/unicodeobject.c.h"
467
468
_Py_error_handler
469
_Py_GetErrorHandler(const char *errors)
470
4.49M
{
471
4.49M
    if (errors == NULL || strcmp(errors, "strict") == 0) {
472
3.74M
        return _Py_ERROR_STRICT;
473
3.74M
    }
474
745k
    if (strcmp(errors, "surrogateescape") == 0) {
475
531k
        return _Py_ERROR_SURROGATEESCAPE;
476
531k
    }
477
214k
    if (strcmp(errors, "replace") == 0) {
478
214k
        return _Py_ERROR_REPLACE;
479
214k
    }
480
4
    if (strcmp(errors, "ignore") == 0) {
481
0
        return _Py_ERROR_IGNORE;
482
0
    }
483
4
    if (strcmp(errors, "backslashreplace") == 0) {
484
0
        return _Py_ERROR_BACKSLASHREPLACE;
485
0
    }
486
4
    if (strcmp(errors, "surrogatepass") == 0) {
487
4
        return _Py_ERROR_SURROGATEPASS;
488
4
    }
489
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
490
0
        return _Py_ERROR_XMLCHARREFREPLACE;
491
0
    }
492
0
    return _Py_ERROR_OTHER;
493
0
}
494
495
496
static _Py_error_handler
497
get_error_handler_wide(const wchar_t *errors)
498
72
{
499
72
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
500
0
        return _Py_ERROR_STRICT;
501
0
    }
502
72
    if (wcscmp(errors, L"surrogateescape") == 0) {
503
72
        return _Py_ERROR_SURROGATEESCAPE;
504
72
    }
505
0
    if (wcscmp(errors, L"replace") == 0) {
506
0
        return _Py_ERROR_REPLACE;
507
0
    }
508
0
    if (wcscmp(errors, L"ignore") == 0) {
509
0
        return _Py_ERROR_IGNORE;
510
0
    }
511
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
512
0
        return _Py_ERROR_BACKSLASHREPLACE;
513
0
    }
514
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
515
0
        return _Py_ERROR_SURROGATEPASS;
516
0
    }
517
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
518
0
        return _Py_ERROR_XMLCHARREFREPLACE;
519
0
    }
520
0
    return _Py_ERROR_OTHER;
521
0
}
522
523
524
static inline int
525
unicode_check_encoding_errors(const char *encoding, const char *errors)
526
42.7M
{
527
42.7M
    if (encoding == NULL && errors == NULL) {
528
12.5M
        return 0;
529
12.5M
    }
530
531
30.1M
    PyInterpreterState *interp = _PyInterpreterState_GET();
532
30.1M
#ifndef Py_DEBUG
533
    /* In release mode, only check in development mode (-X dev) */
534
30.1M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
535
30.1M
        return 0;
536
30.1M
    }
537
#else
538
    /* Always check in debug mode */
539
#endif
540
541
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
542
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
543
0
    if (!interp->unicode.fs_codec.encoding) {
544
0
        return 0;
545
0
    }
546
547
    /* Disable checks during Python finalization. For example, it allows to
548
     * call PyObject_Dump() during finalization for debugging purpose.
549
     */
550
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
551
0
        return 0;
552
0
    }
553
554
0
    if (encoding != NULL
555
        // Fast path for the most common built-in encodings. Even if the codec
556
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
557
        // create a temporary Unicode string (the key in the cache).
558
0
        && strcmp(encoding, "utf-8") != 0
559
0
        && strcmp(encoding, "utf8") != 0
560
0
        && strcmp(encoding, "ascii") != 0)
561
0
    {
562
0
        PyObject *handler = _PyCodec_Lookup(encoding);
563
0
        if (handler == NULL) {
564
0
            return -1;
565
0
        }
566
0
        Py_DECREF(handler);
567
0
    }
568
569
0
    if (errors != NULL
570
        // Fast path for the most common built-in error handlers.
571
0
        && strcmp(errors, "strict") != 0
572
0
        && strcmp(errors, "ignore") != 0
573
0
        && strcmp(errors, "replace") != 0
574
0
        && strcmp(errors, "surrogateescape") != 0
575
0
        && strcmp(errors, "surrogatepass") != 0)
576
0
    {
577
0
        PyObject *handler = PyCodec_LookupError(errors);
578
0
        if (handler == NULL) {
579
0
            return -1;
580
0
        }
581
0
        Py_DECREF(handler);
582
0
    }
583
0
    return 0;
584
0
}
585
586
587
/* Verify the internal invariants of the string object `op`.

   Every failed CHECK() reports through _PyObject_ASSERT_FAILED_MSG() with
   the stringified expression as the message; if no check fails the
   function returns 1.

   The state checks relate the kind / ascii / compact bits to each other
   and to the UTF-8 pointer.  When `check_content` is nonzero the characters
   are additionally scanned (an O(n) pass) to confirm that the largest code
   point matches the kind and the ascii flag, and that the element at index
   `length` is 0.  The interning-state checks are compiled only under
   Py_DEBUG.

   CHECK_IF_GIL() enforces its expression only when Py_GIL_DISABLED is not
   defined, CHECK_IF_FT() only when it is defined; in the other build each
   merely evaluates the expression.

   NOTE(review): only CHECK is #undef'd at the end of the function;
   CHECK_IF_GIL and CHECK_IF_FT stay defined afterwards -- confirm this is
   intended. */
int
588
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
589
0
{
590
0
#define CHECK(expr) \
591
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
592
#ifdef Py_GIL_DISABLED
593
# define CHECK_IF_GIL(expr) (void)(expr)
594
# define CHECK_IF_FT(expr) CHECK(expr)
595
#else
596
0
# define CHECK_IF_GIL(expr) CHECK(expr)
597
0
# define CHECK_IF_FT(expr) (void)(expr)
598
0
#endif
599
600
601
0
    assert(op != NULL);
602
0
    CHECK(PyUnicode_Check(op));
603
604
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
605
0
    int kind = ascii->state.kind;
606
607
    /* compact ASCII strings must use the 1-byte kind */
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
608
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
609
0
    }
610
0
    else {
611
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
612
0
        void *data;
613
614
0
        if (ascii->state.compact == 1) {
615
            /* compact non-ASCII: the character data follows the compact header */
0
            data = compact + 1;
616
0
            CHECK(kind == PyUnicode_1BYTE_KIND
617
0
                                 || kind == PyUnicode_2BYTE_KIND
618
0
                                 || kind == PyUnicode_4BYTE_KIND);
619
0
            CHECK(ascii->state.ascii == 0);
620
0
            CHECK(_PyUnicode_UTF8(op) != data);
621
0
        }
622
0
        else {
623
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
624
625
            /* non-compact: the character data is reached through data.any */
0
            data = unicode->data.any;
626
0
            CHECK(kind == PyUnicode_1BYTE_KIND
627
0
                     || kind == PyUnicode_2BYTE_KIND
628
0
                     || kind == PyUnicode_4BYTE_KIND);
629
0
            CHECK(ascii->state.compact == 0);
630
0
            CHECK(data != NULL);
631
0
            if (ascii->state.ascii) {
632
0
                CHECK(_PyUnicode_UTF8(op) == data);
633
0
                CHECK(compact->utf8_length == ascii->length);
634
0
            }
635
0
            else {
636
0
                CHECK(_PyUnicode_UTF8(op) != data);
637
0
            }
638
0
        }
639
0
#ifndef Py_GIL_DISABLED
640
0
        if (_PyUnicode_UTF8(op) == NULL)
641
0
            CHECK(compact->utf8_length == 0);
642
0
#endif
643
0
    }
644
645
    /* check that the best kind is used: O(n) operation */
646
0
    if (check_content) {
647
0
        Py_ssize_t i;
648
0
        Py_UCS4 maxchar = 0;
649
0
        const void *data;
650
0
        Py_UCS4 ch;
651
652
0
        data = PyUnicode_DATA(ascii);
653
0
        for (i=0; i < ascii->length; i++)
654
0
        {
655
0
            ch = PyUnicode_READ(kind, data, i);
656
0
            if (ch > maxchar)
657
0
                maxchar = ch;
658
0
        }
659
0
        if (kind == PyUnicode_1BYTE_KIND) {
660
0
            if (ascii->state.ascii == 0) {
661
0
                CHECK(maxchar >= 128);
662
0
                CHECK(maxchar <= 255);
663
0
            }
664
0
            else
665
0
                CHECK(maxchar < 128);
666
0
        }
667
0
        else if (kind == PyUnicode_2BYTE_KIND) {
668
0
            CHECK(maxchar >= 0x100);
669
0
            CHECK(maxchar <= 0xFFFF);
670
0
        }
671
0
        else {
672
0
            CHECK(maxchar >= 0x10000);
673
0
            CHECK(maxchar <= MAX_UNICODE);
674
0
        }
675
        /* the element one past the last character must be 0 */
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
676
0
    }
677
678
    /* Check interning state */
679
#ifdef Py_DEBUG
680
    // Note that we do not check `_Py_IsImmortal(op)` in the GIL-enabled build
681
    // since stable ABI extensions can make immortal strings mortal (but with a
682
    // high enough refcount).
683
    switch (PyUnicode_CHECK_INTERNED(op)) {
684
        case SSTATE_NOT_INTERNED:
685
            if (ascii->state.statically_allocated) {
686
                // This state is for two exceptions:
687
                // - strings are currently checked before they're interned
688
                // - the 256 one-latin1-character strings
689
                //   are static but use SSTATE_NOT_INTERNED
690
            }
691
            else {
692
                CHECK_IF_GIL(!_Py_IsImmortal(op));
693
            }
694
            break;
695
        case SSTATE_INTERNED_MORTAL:
696
            CHECK(!ascii->state.statically_allocated);
697
            CHECK_IF_GIL(!_Py_IsImmortal(op));
698
            break;
699
        case SSTATE_INTERNED_IMMORTAL:
700
            CHECK(!ascii->state.statically_allocated);
701
            CHECK_IF_FT(_Py_IsImmortal(op));
702
            break;
703
        case SSTATE_INTERNED_IMMORTAL_STATIC:
704
            CHECK(ascii->state.statically_allocated);
705
            CHECK_IF_FT(_Py_IsImmortal(op));
706
            break;
707
        default:
708
            Py_UNREACHABLE();
709
    }
710
#endif
711
712
0
    return 1;
713
714
0
#undef CHECK
715
0
}
716
717
/* Normalize a freshly built string before handing it to the caller.

   An empty string is replaced by the empty singleton and a one-character
   string of 1-byte kind by the matching LATIN1() object; in both cases
   `unicode` is decref'ed unless it already is that object.  Any other
   string is returned unchanged (after a consistency assertion). */
PyObject*
_PyUnicode_Result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    PyObject *singleton = NULL;
    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        singleton = _PyUnicode_GetEmpty();
    }
    else if (length == 1 && PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
        Py_UCS1 ch = data[0];
        singleton = LATIN1(ch);
    }

    if (singleton == NULL) {
        assert(_PyUnicode_CheckConsistency(unicode, 1));
        return unicode;
    }

    if (unicode != singleton) {
        Py_DECREF(unicode);
    }
    return singleton;
}
747
1.67M
#define unicode_result _PyUnicode_Result
748
749
/* Return `unicode` itself (as a new reference) when it is an exact str;
   for a subtype instance return the result of _PyUnicode_Copy(), i.e. a
   genuine unicode string with the same value. */
static PyObject*
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        /* Subtype */
        return _PyUnicode_Copy(unicode);
    }
    return Py_NewRef(unicode);
}
759
760
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
761
   ASCII, Latin1, UTF-8, etc. */
762
static char*
763
backslashreplace(PyBytesWriter *writer, char *str,
764
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
765
0
{
766
0
    Py_ssize_t size, i;
767
0
    Py_UCS4 ch;
768
0
    int kind;
769
0
    const void *data;
770
771
0
    kind = PyUnicode_KIND(unicode);
772
0
    data = PyUnicode_DATA(unicode);
773
774
0
    size = 0;
775
    /* determine replacement size */
776
0
    for (i = collstart; i < collend; ++i) {
777
0
        Py_ssize_t incr;
778
779
0
        ch = PyUnicode_READ(kind, data, i);
780
0
        if (ch < 0x100)
781
0
            incr = 2+2;
782
0
        else if (ch < 0x10000)
783
0
            incr = 2+4;
784
0
        else {
785
0
            assert(ch <= MAX_UNICODE);
786
0
            incr = 2+8;
787
0
        }
788
0
        if (size > PY_SSIZE_T_MAX - incr) {
789
0
            PyErr_SetString(PyExc_OverflowError,
790
0
                            "encoded result is too long for a Python string");
791
0
            return NULL;
792
0
        }
793
0
        size += incr;
794
0
    }
795
796
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
797
0
    if (str == NULL) {
798
0
        return NULL;
799
0
    }
800
801
    /* generate replacement */
802
0
    for (i = collstart; i < collend; ++i) {
803
0
        ch = PyUnicode_READ(kind, data, i);
804
0
        *str++ = '\\';
805
0
        if (ch >= 0x00010000) {
806
0
            *str++ = 'U';
807
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
808
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
809
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
810
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
811
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
812
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
813
0
        }
814
0
        else if (ch >= 0x100) {
815
0
            *str++ = 'u';
816
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
817
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
818
0
        }
819
0
        else
820
0
            *str++ = 'x';
821
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
822
0
        *str++ = Py_hexdigits[ch&0xf];
823
0
    }
824
0
    return str;
825
0
}
826
827
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
828
   ASCII, Latin1, UTF-8, etc. */
829
static char*
830
xmlcharrefreplace(PyBytesWriter *writer, char *str,
831
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
832
0
{
833
0
    Py_ssize_t size, i;
834
0
    Py_UCS4 ch;
835
0
    int kind;
836
0
    const void *data;
837
838
0
    kind = PyUnicode_KIND(unicode);
839
0
    data = PyUnicode_DATA(unicode);
840
841
0
    size = 0;
842
    /* determine replacement size */
843
0
    for (i = collstart; i < collend; ++i) {
844
0
        Py_ssize_t incr;
845
846
0
        ch = PyUnicode_READ(kind, data, i);
847
0
        if (ch < 10)
848
0
            incr = 2+1+1;
849
0
        else if (ch < 100)
850
0
            incr = 2+2+1;
851
0
        else if (ch < 1000)
852
0
            incr = 2+3+1;
853
0
        else if (ch < 10000)
854
0
            incr = 2+4+1;
855
0
        else if (ch < 100000)
856
0
            incr = 2+5+1;
857
0
        else if (ch < 1000000)
858
0
            incr = 2+6+1;
859
0
        else {
860
0
            assert(ch <= MAX_UNICODE);
861
0
            incr = 2+7+1;
862
0
        }
863
0
        if (size > PY_SSIZE_T_MAX - incr) {
864
0
            PyErr_SetString(PyExc_OverflowError,
865
0
                            "encoded result is too long for a Python string");
866
0
            return NULL;
867
0
        }
868
0
        size += incr;
869
0
    }
870
871
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
872
0
    if (str == NULL) {
873
0
        return NULL;
874
0
    }
875
876
    /* generate replacement */
877
0
    for (i = collstart; i < collend; ++i) {
878
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
879
0
        if (size < 0) {
880
0
            return NULL;
881
0
        }
882
0
        str += size;
883
0
    }
884
0
    return str;
885
0
}
886
887
/* --- Bloom Filters ----------------------------------------------------- */
888
889
/* Helpers implementing simple "bloom filters" for Unicode characters.
890
   To keep things simple, we use a single bitmask, using the low
891
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
892
893
/* the linebreak mask is set up by _PyUnicode_Init() below */
894
895
/* BLOOM_WIDTH is the number of distinct bit positions in a bloom mask.
   It is the largest of 128, 64 or 32 that does not exceed LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Starts with every bit set, so BLOOM(bloom_linebreak, ch) is non-zero for
   every character until the mask is replaced by a narrower one. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that any expression may be passed. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line-break test: characters below 128 are looked up in ascii_linebreak;
   other characters are first screened through the bloom mask and only then
   checked with Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
914
915
static inline BLOOM_MASK
916
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
917
4.43M
{
918
4.43M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
919
4.43M
    do {                                               \
920
4.43M
        TYPE *data = (TYPE *)PTR;                      \
921
4.43M
        TYPE *end = data + LEN;                        \
922
4.43M
        Py_UCS4 ch;                                    \
923
10.5M
        for (; data != end; data++) {                  \
924
6.15M
            ch = *data;                                \
925
6.15M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
926
6.15M
        }                                              \
927
4.43M
        break;                                         \
928
4.43M
    } while (0)
929
930
    /* calculate simple bloom-style bitmask for a given unicode string */
931
932
4.43M
    BLOOM_MASK mask;
933
934
4.43M
    mask = 0;
935
4.43M
    switch (kind) {
936
4.43M
    case PyUnicode_1BYTE_KIND:
937
4.43M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
938
4.43M
        break;
939
36
    case PyUnicode_2BYTE_KIND:
940
36
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
941
36
        break;
942
0
    case PyUnicode_4BYTE_KIND:
943
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
944
0
        break;
945
0
    default:
946
0
        Py_UNREACHABLE();
947
4.43M
    }
948
4.43M
    return mask;
949
950
4.43M
#undef BLOOM_UPDATE
951
4.43M
}
952
953
/* Compilation of templated routines */
954
955
784k
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
956
957
#include "stringlib/asciilib.h"
958
#include "stringlib/fastsearch.h"
959
#include "stringlib/partition.h"
960
#include "stringlib/split.h"
961
#include "stringlib/count.h"
962
#include "stringlib/find.h"
963
#include "stringlib/find_max_char.h"
964
#include "stringlib/undef.h"
965
966
#include "stringlib/ucs1lib.h"
967
#include "stringlib/fastsearch.h"
968
#include "stringlib/partition.h"
969
#include "stringlib/split.h"
970
#include "stringlib/count.h"
971
#include "stringlib/find.h"
972
#include "stringlib/replace.h"
973
#include "stringlib/repr.h"
974
#include "stringlib/find_max_char.h"
975
#include "stringlib/undef.h"
976
977
#include "stringlib/ucs2lib.h"
978
#include "stringlib/fastsearch.h"
979
#include "stringlib/partition.h"
980
#include "stringlib/split.h"
981
#include "stringlib/count.h"
982
#include "stringlib/find.h"
983
#include "stringlib/replace.h"
984
#include "stringlib/repr.h"
985
#include "stringlib/find_max_char.h"
986
#include "stringlib/undef.h"
987
988
#include "stringlib/ucs4lib.h"
989
#include "stringlib/fastsearch.h"
990
#include "stringlib/partition.h"
991
#include "stringlib/split.h"
992
#include "stringlib/count.h"
993
#include "stringlib/find.h"
994
#include "stringlib/replace.h"
995
#include "stringlib/repr.h"
996
#include "stringlib/find_max_char.h"
997
#include "stringlib/undef.h"
998
999
#undef STRINGLIB_GET_EMPTY
1000
1001
/* --- Unicode Object ----------------------------------------------------- */
1002
1003
static inline Py_ssize_t
1004
findchar(const void *s, int kind,
1005
         Py_ssize_t size, Py_UCS4 ch,
1006
         int direction)
1007
217M
{
1008
217M
    switch (kind) {
1009
209M
    case PyUnicode_1BYTE_KIND:
1010
209M
        if ((Py_UCS1) ch != ch)
1011
3.70k
            return -1;
1012
209M
        if (direction > 0)
1013
209M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1014
69.2k
        else
1015
69.2k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1016
5.21M
    case PyUnicode_2BYTE_KIND:
1017
5.21M
        if ((Py_UCS2) ch != ch)
1018
0
            return -1;
1019
5.21M
        if (direction > 0)
1020
5.00M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1021
214k
        else
1022
214k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1023
2.73M
    case PyUnicode_4BYTE_KIND:
1024
2.73M
        if (direction > 0)
1025
2.60M
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1026
129k
        else
1027
129k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1028
0
    default:
1029
0
        Py_UNREACHABLE();
1030
217M
    }
1031
217M
}
1032
1033
#ifdef Py_DEBUG
/* Debug aid: overwrite the characters in [old_length, current length) with
   0xff bytes so that use of not-yet-written characters is noticed early.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Nothing is written when the current length is not larger than
   old_length. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* offsets and sizes are in bytes, hence the scaling by char_size */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1051
1052
static PyObject*
1053
resize_copy(PyObject *unicode, Py_ssize_t length)
1054
0
{
1055
0
    Py_ssize_t copy_length;
1056
0
    PyObject *copy;
1057
1058
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1059
0
    if (copy == NULL)
1060
0
        return NULL;
1061
1062
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1063
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1064
0
    return copy;
1065
0
}
1066
1067
PyObject*
1068
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
1069
59.7M
{
1070
59.7M
    Py_ssize_t char_size;
1071
59.7M
    Py_ssize_t struct_size;
1072
59.7M
    Py_ssize_t new_size;
1073
59.7M
    PyObject *new_unicode;
1074
#ifdef Py_DEBUG
1075
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1076
#endif
1077
1078
59.7M
    if (!_PyUnicode_IsModifiable(unicode)) {
1079
0
        PyObject *copy = resize_copy(unicode, length);
1080
0
        if (copy == NULL) {
1081
0
            return NULL;
1082
0
        }
1083
0
        Py_DECREF(unicode);
1084
0
        return copy;
1085
0
    }
1086
59.7M
    assert(PyUnicode_IS_COMPACT(unicode));
1087
1088
59.7M
    char_size = PyUnicode_KIND(unicode);
1089
59.7M
    if (PyUnicode_IS_ASCII(unicode))
1090
39.3M
        struct_size = sizeof(PyASCIIObject);
1091
20.4M
    else
1092
20.4M
        struct_size = sizeof(PyCompactUnicodeObject);
1093
1094
59.7M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1095
0
        PyErr_NoMemory();
1096
0
        return NULL;
1097
0
    }
1098
59.7M
    new_size = (struct_size + (length + 1) * char_size);
1099
1100
59.7M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1101
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1102
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1103
0
        PyUnicode_SET_UTF8(unicode, NULL);
1104
0
    }
1105
#ifdef Py_TRACE_REFS
1106
    _Py_ForgetReference(unicode);
1107
#endif
1108
59.7M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1109
1110
59.7M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1111
59.7M
    if (new_unicode == NULL) {
1112
0
        _Py_NewReferenceNoTotal(unicode);
1113
0
        PyErr_NoMemory();
1114
0
        return NULL;
1115
0
    }
1116
59.7M
    unicode = new_unicode;
1117
59.7M
    _Py_NewReferenceNoTotal(unicode);
1118
1119
59.7M
    _PyUnicode_LENGTH(unicode) = length;
1120
#ifdef Py_DEBUG
1121
    unicode_fill_invalid(unicode, old_length);
1122
#endif
1123
59.7M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1124
59.7M
                    length, 0);
1125
59.7M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1126
59.7M
    return unicode;
1127
59.7M
}
1128
1129
static int
1130
resize_inplace(PyObject *unicode, Py_ssize_t length)
1131
0
{
1132
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1133
0
    assert(Py_REFCNT(unicode) == 1);
1134
1135
0
    Py_ssize_t new_size;
1136
0
    Py_ssize_t char_size;
1137
0
    int share_utf8;
1138
0
    void *data;
1139
#ifdef Py_DEBUG
1140
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1141
#endif
1142
1143
0
    data = _PyUnicode_DATA_ANY(unicode);
1144
0
    char_size = PyUnicode_KIND(unicode);
1145
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1146
1147
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1148
0
        PyErr_NoMemory();
1149
0
        return -1;
1150
0
    }
1151
0
    new_size = (length + 1) * char_size;
1152
1153
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1154
0
    {
1155
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1156
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1157
0
        PyUnicode_SET_UTF8(unicode, NULL);
1158
0
    }
1159
1160
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1161
0
    if (data == NULL) {
1162
0
        PyErr_NoMemory();
1163
0
        return -1;
1164
0
    }
1165
0
    _PyUnicode_DATA_ANY(unicode) = data;
1166
0
    if (share_utf8) {
1167
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1168
0
        PyUnicode_SET_UTF8(unicode, data);
1169
0
    }
1170
0
    _PyUnicode_LENGTH(unicode) = length;
1171
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1172
#ifdef Py_DEBUG
1173
    unicode_fill_invalid(unicode, old_length);
1174
#endif
1175
1176
    /* check for integer overflow */
1177
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1178
0
        PyErr_NoMemory();
1179
0
        return -1;
1180
0
    }
1181
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1182
0
    return 0;
1183
0
}
1184
1185
static const char*
1186
unicode_kind_name(PyObject *unicode)
1187
0
{
1188
    /* don't check consistency: unicode_kind_name() is called from
1189
       _PyUnicode_Dump() */
1190
0
    if (!PyUnicode_IS_COMPACT(unicode))
1191
0
    {
1192
0
        switch (PyUnicode_KIND(unicode))
1193
0
        {
1194
0
        case PyUnicode_1BYTE_KIND:
1195
0
            if (PyUnicode_IS_ASCII(unicode))
1196
0
                return "legacy ascii";
1197
0
            else
1198
0
                return "legacy latin1";
1199
0
        case PyUnicode_2BYTE_KIND:
1200
0
            return "legacy UCS2";
1201
0
        case PyUnicode_4BYTE_KIND:
1202
0
            return "legacy UCS4";
1203
0
        default:
1204
0
            return "<legacy invalid kind>";
1205
0
        }
1206
0
    }
1207
0
    switch (PyUnicode_KIND(unicode)) {
1208
0
    case PyUnicode_1BYTE_KIND:
1209
0
        if (PyUnicode_IS_ASCII(unicode))
1210
0
            return "ascii";
1211
0
        else
1212
0
            return "latin1";
1213
0
    case PyUnicode_2BYTE_KIND:
1214
0
        return "UCS2";
1215
0
    case PyUnicode_4BYTE_KIND:
1216
0
        return "UCS4";
1217
0
    default:
1218
0
        return "<invalid compact kind>";
1219
0
    }
1220
0
}
1221
1222
#ifdef Py_DEBUG
1223
/* Functions wrapping macros for use in debugger */
1224
/* Function form of PyUnicode_UTF8(), callable from a debugger. */
const char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1228
1229
/* Function form of _PyUnicode_COMPACT_DATA(), callable from a debugger. */
const void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
1233
const void *_PyUnicode_data(void *unicode_raw) {
1234
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1235
    printf("obj %p\n", (void*)unicode);
1236
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1237
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1238
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1239
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1240
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1241
    return PyUnicode_DATA(unicode);
1242
}
1243
1244
void
1245
_PyUnicode_Dump(PyObject *op)
1246
{
1247
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1248
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1249
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1250
    const void *data;
1251
1252
    if (ascii->state.compact)
1253
    {
1254
        if (ascii->state.ascii)
1255
            data = (ascii + 1);
1256
        else
1257
            data = (compact + 1);
1258
    }
1259
    else
1260
        data = unicode->data.any;
1261
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1262
1263
    if (!ascii->state.ascii) {
1264
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1265
    }
1266
    printf(", data=%p\n", data);
1267
}
1268
#endif
1269
1270
1271
PyObject *
1272
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1273
549M
{
1274
    /* Optimization for empty strings */
1275
549M
    if (size == 0) {
1276
16.6M
        return _PyUnicode_GetEmpty();
1277
16.6M
    }
1278
1279
532M
    PyObject *obj;
1280
532M
    PyCompactUnicodeObject *unicode;
1281
532M
    void *data;
1282
532M
    int kind;
1283
532M
    int is_ascii;
1284
532M
    Py_ssize_t char_size;
1285
532M
    Py_ssize_t struct_size;
1286
1287
532M
    is_ascii = 0;
1288
532M
    struct_size = sizeof(PyCompactUnicodeObject);
1289
532M
    if (maxchar < 128) {
1290
338M
        kind = PyUnicode_1BYTE_KIND;
1291
338M
        char_size = 1;
1292
338M
        is_ascii = 1;
1293
338M
        struct_size = sizeof(PyASCIIObject);
1294
338M
    }
1295
193M
    else if (maxchar < 256) {
1296
13.9M
        kind = PyUnicode_1BYTE_KIND;
1297
13.9M
        char_size = 1;
1298
13.9M
    }
1299
180M
    else if (maxchar < 65536) {
1300
168M
        kind = PyUnicode_2BYTE_KIND;
1301
168M
        char_size = 2;
1302
168M
    }
1303
11.9M
    else {
1304
11.9M
        if (maxchar > MAX_UNICODE) {
1305
0
            PyErr_SetString(PyExc_SystemError,
1306
0
                            "invalid maximum character passed to PyUnicode_New");
1307
0
            return NULL;
1308
0
        }
1309
11.9M
        kind = PyUnicode_4BYTE_KIND;
1310
11.9M
        char_size = 4;
1311
11.9M
    }
1312
1313
    /* Ensure we won't overflow the size. */
1314
532M
    if (size < 0) {
1315
0
        PyErr_SetString(PyExc_SystemError,
1316
0
                        "Negative size passed to PyUnicode_New");
1317
0
        return NULL;
1318
0
    }
1319
532M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1320
0
        return PyErr_NoMemory();
1321
1322
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1323
     * PyObject_New() so we are able to allocate space for the object and
1324
     * it's data buffer.
1325
     */
1326
532M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1327
532M
    if (obj == NULL) {
1328
0
        return PyErr_NoMemory();
1329
0
    }
1330
532M
    _PyObject_Init(obj, &PyUnicode_Type);
1331
1332
532M
    unicode = (PyCompactUnicodeObject *)obj;
1333
532M
    if (is_ascii)
1334
338M
        data = ((PyASCIIObject*)obj) + 1;
1335
193M
    else
1336
193M
        data = unicode + 1;
1337
532M
    _PyUnicode_LENGTH(unicode) = size;
1338
532M
    _PyUnicode_HASH(unicode) = -1;
1339
532M
    _PyUnicode_STATE(unicode).interned = 0;
1340
532M
    _PyUnicode_STATE(unicode).kind = kind;
1341
532M
    _PyUnicode_STATE(unicode).compact = 1;
1342
532M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1343
532M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1344
532M
    if (is_ascii) {
1345
338M
        ((char*)data)[size] = 0;
1346
338M
    }
1347
193M
    else if (kind == PyUnicode_1BYTE_KIND) {
1348
13.9M
        ((char*)data)[size] = 0;
1349
13.9M
        unicode->utf8 = NULL;
1350
13.9M
        unicode->utf8_length = 0;
1351
13.9M
    }
1352
180M
    else {
1353
180M
        unicode->utf8 = NULL;
1354
180M
        unicode->utf8_length = 0;
1355
180M
        if (kind == PyUnicode_2BYTE_KIND)
1356
168M
            ((Py_UCS2*)data)[size] = 0;
1357
11.9M
        else /* kind == PyUnicode_4BYTE_KIND */
1358
11.9M
            ((Py_UCS4*)data)[size] = 0;
1359
180M
    }
1360
#ifdef Py_DEBUG
1361
    unicode_fill_invalid((PyObject*)unicode, 0);
1362
#endif
1363
532M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1364
532M
    return obj;
1365
532M
}
1366
1367
static int
1368
unicode_check_modifiable(PyObject *unicode)
1369
652
{
1370
652
    if (!_PyUnicode_IsModifiable(unicode)) {
1371
0
        PyErr_SetString(PyExc_SystemError,
1372
0
                        "Cannot modify a string currently used");
1373
0
        return -1;
1374
0
    }
1375
652
    return 0;
1376
652
}
1377
1378
static int
1379
_copy_characters(PyObject *to, Py_ssize_t to_start,
1380
                 PyObject *from, Py_ssize_t from_start,
1381
                 Py_ssize_t how_many, int check_maxchar)
1382
274M
{
1383
274M
    int from_kind, to_kind;
1384
274M
    const void *from_data;
1385
274M
    void *to_data;
1386
1387
274M
    assert(0 <= how_many);
1388
274M
    assert(0 <= from_start);
1389
274M
    assert(0 <= to_start);
1390
274M
    assert(PyUnicode_Check(from));
1391
274M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1392
1393
274M
    assert(to == NULL || PyUnicode_Check(to));
1394
1395
274M
    if (how_many == 0) {
1396
4.97M
        return 0;
1397
4.97M
    }
1398
1399
274M
    assert(to != NULL);
1400
269M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1401
1402
269M
    from_kind = PyUnicode_KIND(from);
1403
269M
    from_data = PyUnicode_DATA(from);
1404
269M
    to_kind = PyUnicode_KIND(to);
1405
269M
    to_data = PyUnicode_DATA(to);
1406
1407
#ifdef Py_DEBUG
1408
    if (!check_maxchar
1409
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1410
    {
1411
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1412
        Py_UCS4 ch;
1413
        Py_ssize_t i;
1414
        for (i=0; i < how_many; i++) {
1415
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1416
            assert(ch <= to_maxchar);
1417
        }
1418
    }
1419
#endif
1420
1421
269M
    if (from_kind == to_kind) {
1422
164M
        if (check_maxchar
1423
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1424
0
        {
1425
            /* Writing Latin-1 characters into an ASCII string requires to
1426
               check that all written characters are pure ASCII */
1427
0
            Py_UCS4 max_char;
1428
0
            max_char = ucs1lib_find_max_char(from_data,
1429
0
                                             (const Py_UCS1*)from_data + how_many);
1430
0
            if (max_char >= 128)
1431
0
                return -1;
1432
0
        }
1433
164M
        memcpy((char*)to_data + to_kind * to_start,
1434
164M
                  (const char*)from_data + from_kind * from_start,
1435
164M
                  to_kind * how_many);
1436
164M
    }
1437
105M
    else if (from_kind == PyUnicode_1BYTE_KIND
1438
102M
             && to_kind == PyUnicode_2BYTE_KIND)
1439
85.8M
    {
1440
85.8M
        _PyUnicode_CONVERT_BYTES(
1441
85.8M
            Py_UCS1, Py_UCS2,
1442
85.8M
            PyUnicode_1BYTE_DATA(from) + from_start,
1443
85.8M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1444
85.8M
            PyUnicode_2BYTE_DATA(to) + to_start
1445
85.8M
            );
1446
85.8M
    }
1447
19.1M
    else if (from_kind == PyUnicode_1BYTE_KIND
1448
16.2M
             && to_kind == PyUnicode_4BYTE_KIND)
1449
16.2M
    {
1450
16.2M
        _PyUnicode_CONVERT_BYTES(
1451
16.2M
            Py_UCS1, Py_UCS4,
1452
16.2M
            PyUnicode_1BYTE_DATA(from) + from_start,
1453
16.2M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1454
16.2M
            PyUnicode_4BYTE_DATA(to) + to_start
1455
16.2M
            );
1456
16.2M
    }
1457
2.92M
    else if (from_kind == PyUnicode_2BYTE_KIND
1458
2.91M
             && to_kind == PyUnicode_4BYTE_KIND)
1459
2.90M
    {
1460
2.90M
        _PyUnicode_CONVERT_BYTES(
1461
2.90M
            Py_UCS2, Py_UCS4,
1462
2.90M
            PyUnicode_2BYTE_DATA(from) + from_start,
1463
2.90M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1464
2.90M
            PyUnicode_4BYTE_DATA(to) + to_start
1465
2.90M
            );
1466
2.90M
    }
1467
17.4k
    else {
1468
17.4k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1469
1470
17.4k
        if (!check_maxchar) {
1471
17.4k
            if (from_kind == PyUnicode_2BYTE_KIND
1472
3.51k
                && to_kind == PyUnicode_1BYTE_KIND)
1473
3.51k
            {
1474
3.51k
                _PyUnicode_CONVERT_BYTES(
1475
3.51k
                    Py_UCS2, Py_UCS1,
1476
3.51k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1477
3.51k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1478
3.51k
                    PyUnicode_1BYTE_DATA(to) + to_start
1479
3.51k
                    );
1480
3.51k
            }
1481
13.9k
            else if (from_kind == PyUnicode_4BYTE_KIND
1482
13.9k
                     && to_kind == PyUnicode_1BYTE_KIND)
1483
6.74k
            {
1484
6.74k
                _PyUnicode_CONVERT_BYTES(
1485
6.74k
                    Py_UCS4, Py_UCS1,
1486
6.74k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1487
6.74k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1488
6.74k
                    PyUnicode_1BYTE_DATA(to) + to_start
1489
6.74k
                    );
1490
6.74k
            }
1491
7.20k
            else if (from_kind == PyUnicode_4BYTE_KIND
1492
7.20k
                     && to_kind == PyUnicode_2BYTE_KIND)
1493
7.20k
            {
1494
7.20k
                _PyUnicode_CONVERT_BYTES(
1495
7.20k
                    Py_UCS4, Py_UCS2,
1496
7.20k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1497
7.20k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1498
7.20k
                    PyUnicode_2BYTE_DATA(to) + to_start
1499
7.20k
                    );
1500
7.20k
            }
1501
0
            else {
1502
0
                Py_UNREACHABLE();
1503
0
            }
1504
17.4k
        }
1505
0
        else {
1506
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1507
0
            Py_UCS4 ch;
1508
0
            Py_ssize_t i;
1509
1510
0
            for (i=0; i < how_many; i++) {
1511
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1512
0
                if (ch > to_maxchar)
1513
0
                    return -1;
1514
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1515
0
            }
1516
0
        }
1517
17.4k
    }
1518
269M
    return 0;
1519
269M
}
1520
1521
void
1522
_PyUnicode_FastCopyCharacters(
1523
    PyObject *to, Py_ssize_t to_start,
1524
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1525
274M
{
1526
274M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1527
274M
}
1528
1529
Py_ssize_t
1530
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1531
                         PyObject *from, Py_ssize_t from_start,
1532
                         Py_ssize_t how_many)
1533
0
{
1534
0
    int err;
1535
1536
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1537
0
        PyErr_BadInternalCall();
1538
0
        return -1;
1539
0
    }
1540
1541
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1542
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1543
0
        return -1;
1544
0
    }
1545
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1546
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1547
0
        return -1;
1548
0
    }
1549
0
    if (how_many < 0) {
1550
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1551
0
        return -1;
1552
0
    }
1553
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1554
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1555
0
        PyErr_Format(PyExc_SystemError,
1556
0
                     "Cannot write %zi characters at %zi "
1557
0
                     "in a string of %zi characters",
1558
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1559
0
        return -1;
1560
0
    }
1561
1562
0
    if (how_many == 0)
1563
0
        return 0;
1564
1565
0
    if (unicode_check_modifiable(to))
1566
0
        return -1;
1567
1568
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1569
0
    if (err) {
1570
0
        PyErr_Format(PyExc_SystemError,
1571
0
                     "Cannot copy %s characters "
1572
0
                     "into a string of %s characters",
1573
0
                     unicode_kind_name(from),
1574
0
                     unicode_kind_name(to));
1575
0
        return -1;
1576
0
    }
1577
0
    return how_many;
1578
0
}
1579
1580
/* Find the maximum code point and count the number of surrogate pairs so a
1581
   correct string length can be computed before converting a string to UCS4.
1582
   This function counts single surrogates as a character and not as a pair.
1583
1584
   Return 0 on success, or -1 on error. */
1585
static int
1586
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1587
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1588
479k
{
1589
479k
    const wchar_t *iter;
1590
479k
    Py_UCS4 ch;
1591
1592
479k
    assert(num_surrogates != NULL && maxchar != NULL);
1593
479k
    *num_surrogates = 0;
1594
479k
    *maxchar = 0;
1595
1596
14.2M
    for (iter = begin; iter < end; ) {
1597
#if SIZEOF_WCHAR_T == 2
1598
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1599
            && (iter+1) < end
1600
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1601
        {
1602
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1603
            ++(*num_surrogates);
1604
            iter += 2;
1605
        }
1606
        else
1607
#endif
1608
13.8M
        {
1609
13.8M
            ch = *iter;
1610
13.8M
            iter++;
1611
13.8M
        }
1612
13.8M
        if (ch > *maxchar) {
1613
1.99M
            *maxchar = ch;
1614
1.99M
            if (*maxchar > MAX_UNICODE) {
1615
0
                PyErr_Format(PyExc_ValueError,
1616
0
                             "character U+%x is not in range [U+0000; U+%x]",
1617
0
                             ch, MAX_UNICODE);
1618
0
                return -1;
1619
0
            }
1620
1.99M
        }
1621
13.8M
    }
1622
479k
    return 0;
1623
479k
}
1624
1625
static void
1626
unicode_dealloc(PyObject *unicode)
1627
541M
{
1628
#ifdef Py_DEBUG
1629
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1630
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1631
    }
1632
#endif
1633
541M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1634
        /* This should never get called, but we also don't want to SEGV if
1635
        * we accidentally decref an immortal string out of existence. Since
1636
        * the string is an immortal object, just re-set the reference count.
1637
        */
1638
#ifdef Py_DEBUG
1639
        Py_UNREACHABLE();
1640
#endif
1641
0
        _Py_SetImmortal(unicode);
1642
0
        return;
1643
0
    }
1644
541M
    switch (_PyUnicode_STATE(unicode).interned) {
1645
541M
        case SSTATE_NOT_INTERNED:
1646
541M
            break;
1647
207k
        case SSTATE_INTERNED_MORTAL:
1648
            /* Remove the object from the intern dict.
1649
             * Before doing so, we set the refcount to 2: the key and value
1650
             * in the interned_dict.
1651
             */
1652
207k
            assert(Py_REFCNT(unicode) == 0);
1653
207k
            Py_SET_REFCNT(unicode, 2);
1654
#ifdef Py_REF_DEBUG
1655
            /* let's be pedantic with the ref total */
1656
            _Py_IncRefTotal(_PyThreadState_GET());
1657
            _Py_IncRefTotal(_PyThreadState_GET());
1658
#endif
1659
207k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1660
207k
            PyObject *interned = get_interned_dict(interp);
1661
207k
            assert(interned != NULL);
1662
207k
            PyObject *popped;
1663
207k
            int r = PyDict_Pop(interned, unicode, &popped);
1664
207k
            if (r == -1) {
1665
0
                PyErr_FormatUnraisable("Exception ignored while "
1666
0
                                       "removing an interned string %R",
1667
0
                                       unicode);
1668
                // We don't know what happened to the string. It's probably
1669
                // best to leak it:
1670
                // - if it was popped, there are no more references to it
1671
                //   so it can't cause trouble (except wasted memory)
1672
                // - if it wasn't popped, it'll remain interned
1673
0
                _Py_SetImmortal(unicode);
1674
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1675
0
                return;
1676
0
            }
1677
207k
            if (r == 0) {
1678
                // The interned string was not found in the interned_dict.
1679
#ifdef Py_DEBUG
1680
                Py_UNREACHABLE();
1681
#endif
1682
0
                _Py_SetImmortal(unicode);
1683
0
                return;
1684
0
            }
1685
            // Successfully popped.
1686
207k
            assert(popped == unicode);
1687
            // Only our `popped` reference should be left; remove it too.
1688
207k
            assert(Py_REFCNT(unicode) == 1);
1689
207k
            Py_SET_REFCNT(unicode, 0);
1690
#ifdef Py_REF_DEBUG
1691
            /* let's be pedantic with the ref total */
1692
            _Py_DecRefTotal(_PyThreadState_GET());
1693
#endif
1694
207k
            break;
1695
0
        default:
1696
            // As with `statically_allocated` above.
1697
#ifdef Py_REF_DEBUG
1698
            Py_UNREACHABLE();
1699
#endif
1700
0
            _Py_SetImmortal(unicode);
1701
0
            return;
1702
541M
    }
1703
541M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1704
155k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1705
155k
    }
1706
541M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1707
9.81M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1708
9.81M
    }
1709
1710
541M
    Py_TYPE(unicode)->tp_free(unicode);
1711
541M
}
1712
1713
#ifdef Py_DEBUG
/* Return 1 if `unicode` is the empty-string object or the LATIN1() object
   for its single character, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
    if (ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && LATIN1(ch) == unicode) ? 1 : 0;
}
#endif
1731
1732
int
1733
_PyUnicode_IsModifiable(PyObject *unicode)
1734
66.8M
{
1735
66.8M
    assert(_PyUnicode_CHECK(unicode));
1736
66.8M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1737
3.19M
        return 0;
1738
63.6M
    if (PyUnicode_HASH(unicode) != -1)
1739
0
        return 0;
1740
63.6M
    if (PyUnicode_CHECK_INTERNED(unicode))
1741
0
        return 0;
1742
63.6M
    if (!PyUnicode_CheckExact(unicode))
1743
0
        return 0;
1744
#ifdef Py_DEBUG
1745
    /* singleton refcount is greater than 1 */
1746
    assert(!unicode_is_singleton(unicode));
1747
#endif
1748
63.6M
    return 1;
1749
63.6M
}
1750
1751
static int
1752
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1753
1.94M
{
1754
1.94M
    PyObject *unicode;
1755
1.94M
    Py_ssize_t old_length;
1756
1757
1.94M
    assert(p_unicode != NULL);
1758
1.94M
    unicode = *p_unicode;
1759
1760
1.94M
    assert(unicode != NULL);
1761
1.94M
    assert(PyUnicode_Check(unicode));
1762
1.94M
    assert(0 <= length);
1763
1764
1.94M
    old_length = PyUnicode_GET_LENGTH(unicode);
1765
1.94M
    if (old_length == length)
1766
0
        return 0;
1767
1768
1.94M
    if (length == 0) {
1769
0
        PyObject *empty = _PyUnicode_GetEmpty();
1770
0
        Py_SETREF(*p_unicode, empty);
1771
0
        return 0;
1772
0
    }
1773
1774
1.94M
    if (!_PyUnicode_IsModifiable(unicode)) {
1775
0
        PyObject *copy = resize_copy(unicode, length);
1776
0
        if (copy == NULL)
1777
0
            return -1;
1778
0
        Py_SETREF(*p_unicode, copy);
1779
0
        return 0;
1780
0
    }
1781
1782
1.94M
    if (PyUnicode_IS_COMPACT(unicode)) {
1783
1.94M
        PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
1784
1.94M
        if (new_unicode == NULL)
1785
0
            return -1;
1786
1.94M
        *p_unicode = new_unicode;
1787
1.94M
        return 0;
1788
1.94M
    }
1789
0
    return resize_inplace(unicode, length);
1790
1.94M
}
1791
1792
int
1793
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1794
0
{
1795
0
    PyObject *unicode;
1796
0
    if (p_unicode == NULL) {
1797
0
        PyErr_BadInternalCall();
1798
0
        return -1;
1799
0
    }
1800
0
    unicode = *p_unicode;
1801
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1802
0
    {
1803
0
        PyErr_BadInternalCall();
1804
0
        return -1;
1805
0
    }
1806
0
    return unicode_resize(p_unicode, length);
1807
0
}
1808
1809
static PyObject*
1810
get_latin1_char(Py_UCS1 ch)
1811
233M
{
1812
233M
    PyObject *o = LATIN1(ch);
1813
233M
    return o;
1814
233M
}
1815
1816
static PyObject*
1817
unicode_char(Py_UCS4 ch)
1818
267M
{
1819
267M
    PyObject *unicode;
1820
1821
267M
    assert(ch <= MAX_UNICODE);
1822
1823
267M
    if (ch < 256) {
1824
138M
        return get_latin1_char(ch);
1825
138M
    }
1826
1827
128M
    unicode = PyUnicode_New(1, ch);
1828
128M
    if (unicode == NULL)
1829
0
        return NULL;
1830
1831
128M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1832
128M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1833
119M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1834
119M
    } else {
1835
9.25M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1836
9.25M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1837
9.25M
    }
1838
128M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1839
128M
    return unicode;
1840
128M
}
1841
1842
1843
static inline void
1844
unicode_write_widechar(int kind, void *data,
1845
                       const wchar_t *u, Py_ssize_t size,
1846
                       Py_ssize_t num_surrogates)
1847
479k
{
1848
479k
    switch (kind) {
1849
446k
    case PyUnicode_1BYTE_KIND:
1850
446k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1851
446k
        break;
1852
1853
31.9k
    case PyUnicode_2BYTE_KIND:
1854
#if SIZEOF_WCHAR_T == 2
1855
        memcpy(data, u, size * 2);
1856
#else
1857
31.9k
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1858
31.9k
#endif
1859
31.9k
        break;
1860
1861
1.25k
    case PyUnicode_4BYTE_KIND:
1862
1.25k
    {
1863
#if SIZEOF_WCHAR_T == 2
1864
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1865
        // surrogate pairs.
1866
        const wchar_t *end = u + size;
1867
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1868
#  ifndef NDEBUG
1869
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1870
#  endif
1871
        for (const wchar_t *iter = u; iter < end; ) {
1872
            assert(ucs4_out < ucs4_end);
1873
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1874
                && (iter+1) < end
1875
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1876
            {
1877
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1878
                iter += 2;
1879
            }
1880
            else {
1881
                *ucs4_out++ = *iter;
1882
                iter++;
1883
            }
1884
        }
1885
        assert(ucs4_out == ucs4_end);
1886
#else
1887
1.25k
        assert(num_surrogates == 0);
1888
1.25k
        memcpy(data, u, size * 4);
1889
1.25k
#endif
1890
1.25k
        break;
1891
0
    }
1892
0
    default:
1893
0
        Py_UNREACHABLE();
1894
479k
    }
1895
479k
}
1896
1897
1898
PyObject *
1899
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1900
845k
{
1901
845k
    PyObject *unicode;
1902
845k
    Py_UCS4 maxchar = 0;
1903
845k
    Py_ssize_t num_surrogates;
1904
1905
845k
    if (u == NULL && size != 0) {
1906
0
        PyErr_BadInternalCall();
1907
0
        return NULL;
1908
0
    }
1909
1910
845k
    if (size == -1) {
1911
1.26k
        size = wcslen(u);
1912
1.26k
    }
1913
1914
    /* If the Unicode data is known at construction time, we can apply
1915
       some optimizations which share commonly used objects. */
1916
1917
    /* Optimization for empty strings */
1918
845k
    if (size == 0)
1919
295k
        _Py_RETURN_UNICODE_EMPTY();
1920
1921
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1922
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1923
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1924
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1925
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1926
        if (!converted) {
1927
            return NULL;
1928
        }
1929
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1930
        PyMem_Free(converted);
1931
        return unicode;
1932
    }
1933
#endif
1934
1935
    /* Single character Unicode objects in the Latin-1 range are
1936
       shared when using this constructor */
1937
549k
    if (size == 1 && (Py_UCS4)*u < 256)
1938
70.1k
        return get_latin1_char((unsigned char)*u);
1939
1940
    /* If not empty and not single character, copy the Unicode data
1941
       into the new object */
1942
479k
    if (find_maxchar_surrogates(u, u + size,
1943
479k
                                &maxchar, &num_surrogates) == -1)
1944
0
        return NULL;
1945
1946
479k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1947
479k
    if (!unicode)
1948
0
        return NULL;
1949
1950
479k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1951
479k
                           u, size, num_surrogates);
1952
1953
479k
    return unicode_result(unicode);
1954
479k
}
1955
1956
1957
int
1958
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
1959
                              const wchar_t *str,
1960
                              Py_ssize_t size)
1961
0
{
1962
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
1963
1964
0
    if (size < 0) {
1965
0
        size = wcslen(str);
1966
0
    }
1967
1968
0
    if (size == 0) {
1969
0
        return 0;
1970
0
    }
1971
1972
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1973
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1974
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1975
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1976
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
1977
        if (!converted) {
1978
            return -1;
1979
        }
1980
1981
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
1982
        PyMem_Free(converted);
1983
        return res;
1984
    }
1985
#endif
1986
1987
0
    Py_UCS4 maxchar = 0;
1988
0
    Py_ssize_t num_surrogates;
1989
0
    if (find_maxchar_surrogates(str, str + size,
1990
0
                                &maxchar, &num_surrogates) == -1) {
1991
0
        return -1;
1992
0
    }
1993
1994
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
1995
0
        return -1;
1996
0
    }
1997
1998
0
    int kind = writer->kind;
1999
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2000
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
2001
2002
0
    writer->pos += size - num_surrogates;
2003
0
    return 0;
2004
0
}
2005
2006
2007
PyObject *
2008
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2009
9.36M
{
2010
9.36M
    if (size < 0) {
2011
0
        PyErr_SetString(PyExc_SystemError,
2012
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2013
0
        return NULL;
2014
0
    }
2015
9.36M
    if (u != NULL) {
2016
9.36M
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2017
9.36M
    }
2018
0
    if (size > 0) {
2019
0
        PyErr_SetString(PyExc_SystemError,
2020
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2021
0
        return NULL;
2022
0
    }
2023
0
    return _PyUnicode_GetEmpty();
2024
0
}
2025
2026
PyObject *
2027
PyUnicode_FromString(const char *u)
2028
22.2M
{
2029
22.2M
    size_t size = strlen(u);
2030
22.2M
    if (size > PY_SSIZE_T_MAX) {
2031
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2032
0
        return NULL;
2033
0
    }
2034
22.2M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2035
22.2M
}
2036
2037
2038
PyObject *
2039
_PyUnicode_FromId(_Py_Identifier *id)
2040
0
{
2041
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2042
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2043
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2044
2045
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2046
0
    if (index < 0) {
2047
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2048
2049
0
        PyMutex_Lock(&rt_ids->mutex);
2050
        // Check again to detect concurrent access. Another thread can have
2051
        // initialized the index while this thread waited for the lock.
2052
0
        index = _Py_atomic_load_ssize(&id->index);
2053
0
        if (index < 0) {
2054
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2055
0
            index = rt_ids->next_index;
2056
0
            rt_ids->next_index++;
2057
0
            _Py_atomic_store_ssize(&id->index, index);
2058
0
        }
2059
0
        PyMutex_Unlock(&rt_ids->mutex);
2060
0
    }
2061
0
    assert(index >= 0);
2062
2063
0
    PyObject *obj;
2064
0
    if (index < ids->size) {
2065
0
        obj = ids->array[index];
2066
0
        if (obj) {
2067
            // Return a borrowed reference
2068
0
            goto end;
2069
0
        }
2070
0
    }
2071
2072
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2073
0
                                       NULL, NULL);
2074
0
    if (!obj) {
2075
0
        goto end;
2076
0
    }
2077
0
    _PyUnicode_InternImmortal(interp, &obj);
2078
2079
0
    if (index >= ids->size) {
2080
        // Overallocate to reduce the number of realloc
2081
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2082
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2083
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2084
0
        if (new_array == NULL) {
2085
0
            PyErr_NoMemory();
2086
0
            obj = NULL;
2087
0
            goto end;
2088
0
        }
2089
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2090
0
        ids->array = new_array;
2091
0
        ids->size = new_size;
2092
0
    }
2093
2094
    // The array stores a strong reference
2095
0
    ids->array[index] = obj;
2096
2097
0
end:
2098
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2099
    // Return a borrowed reference
2100
0
    return obj;
2101
0
}
2102
2103
2104
static void
2105
unicode_clear_identifiers(struct _Py_unicode_state *state)
2106
0
{
2107
0
    struct _Py_unicode_ids *ids = &state->ids;
2108
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2109
0
        Py_XDECREF(ids->array[i]);
2110
0
    }
2111
0
    ids->size = 0;
2112
0
    PyMem_Free(ids->array);
2113
0
    ids->array = NULL;
2114
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2115
    // after Py_Finalize().
2116
0
}
2117
2118
2119
/* Internal function, doesn't check maximum character */
2120
2121
PyObject*
2122
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2123
112M
{
2124
112M
    const unsigned char *s = (const unsigned char *)buffer;
2125
112M
    PyObject *unicode;
2126
112M
    if (size == 1) {
2127
#ifdef Py_DEBUG
2128
        assert((unsigned char)s[0] < 128);
2129
#endif
2130
43.7M
        return get_latin1_char(s[0]);
2131
43.7M
    }
2132
68.9M
    unicode = PyUnicode_New(size, 127);
2133
68.9M
    if (!unicode)
2134
0
        return NULL;
2135
68.9M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2136
68.9M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2137
68.9M
    return unicode;
2138
68.9M
}
2139
2140
static Py_UCS4
2141
kind_maxchar_limit(int kind)
2142
0
{
2143
0
    switch (kind) {
2144
0
    case PyUnicode_1BYTE_KIND:
2145
0
        return 0x80;
2146
0
    case PyUnicode_2BYTE_KIND:
2147
0
        return 0x100;
2148
0
    case PyUnicode_4BYTE_KIND:
2149
0
        return 0x10000;
2150
0
    default:
2151
0
        Py_UNREACHABLE();
2152
0
    }
2153
0
}
2154
2155
static PyObject*
2156
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2157
47.6M
{
2158
47.6M
    PyObject *res;
2159
47.6M
    unsigned char max_char;
2160
2161
47.6M
    if (size == 0) {
2162
7.99M
        _Py_RETURN_UNICODE_EMPTY();
2163
7.99M
    }
2164
47.6M
    assert(size > 0);
2165
39.6M
    if (size == 1) {
2166
9.94M
        return get_latin1_char(u[0]);
2167
9.94M
    }
2168
2169
29.7M
    max_char = ucs1lib_find_max_char(u, u + size);
2170
29.7M
    res = PyUnicode_New(size, max_char);
2171
29.7M
    if (!res)
2172
0
        return NULL;
2173
29.7M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2174
29.7M
    assert(_PyUnicode_CheckConsistency(res, 1));
2175
29.7M
    return res;
2176
29.7M
}
2177
2178
static PyObject*
2179
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2180
99.5M
{
2181
99.5M
    PyObject *res;
2182
99.5M
    Py_UCS2 max_char;
2183
2184
99.5M
    if (size == 0)
2185
13.4M
        _Py_RETURN_UNICODE_EMPTY();
2186
99.5M
    assert(size > 0);
2187
86.0M
    if (size == 1)
2188
59.6M
        return unicode_char(u[0]);
2189
2190
26.4M
    max_char = ucs2lib_find_max_char(u, u + size);
2191
26.4M
    res = PyUnicode_New(size, max_char);
2192
26.4M
    if (!res)
2193
0
        return NULL;
2194
26.4M
    if (max_char >= 256)
2195
15.8M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2196
10.5M
    else {
2197
10.5M
        _PyUnicode_CONVERT_BYTES(
2198
10.5M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2199
10.5M
    }
2200
26.4M
    assert(_PyUnicode_CheckConsistency(res, 1));
2201
26.4M
    return res;
2202
26.4M
}
2203
2204
static PyObject*
2205
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2206
65.3M
{
2207
65.3M
    PyObject *res;
2208
65.3M
    Py_UCS4 max_char;
2209
2210
65.3M
    if (size == 0)
2211
8.84M
        _Py_RETURN_UNICODE_EMPTY();
2212
65.3M
    assert(size > 0);
2213
56.5M
    if (size == 1)
2214
39.4M
        return unicode_char(u[0]);
2215
2216
17.0M
    max_char = ucs4lib_find_max_char(u, u + size);
2217
17.0M
    res = PyUnicode_New(size, max_char);
2218
17.0M
    if (!res)
2219
0
        return NULL;
2220
17.0M
    if (max_char < 256)
2221
12.0M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2222
17.0M
                                 PyUnicode_1BYTE_DATA(res));
2223
4.96M
    else if (max_char < 0x10000)
2224
3.43M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2225
4.96M
                                 PyUnicode_2BYTE_DATA(res));
2226
1.53M
    else
2227
1.53M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2228
17.0M
    assert(_PyUnicode_CheckConsistency(res, 1));
2229
17.0M
    return res;
2230
17.0M
}
2231
2232
2233
int
2234
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2235
                          const Py_UCS4 *str,
2236
                          Py_ssize_t size)
2237
0
{
2238
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2239
2240
0
    if (size < 0) {
2241
0
        PyErr_SetString(PyExc_ValueError,
2242
0
                        "size must be positive");
2243
0
        return -1;
2244
0
    }
2245
2246
0
    if (size == 0) {
2247
0
        return 0;
2248
0
    }
2249
2250
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2251
2252
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2253
0
        return -1;
2254
0
    }
2255
2256
0
    int kind = writer->kind;
2257
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2258
0
    if (kind == PyUnicode_1BYTE_KIND) {
2259
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2260
0
                                 str, str + size,
2261
0
                                 data);
2262
0
    }
2263
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2264
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2265
0
                                 str, str + size,
2266
0
                                 data);
2267
0
    }
2268
0
    else {
2269
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2270
0
    }
2271
0
    writer->pos += size;
2272
2273
0
    return 0;
2274
0
}
2275
2276
2277
PyObject*
2278
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2279
157M
{
2280
157M
    if (size < 0) {
2281
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2282
0
        return NULL;
2283
0
    }
2284
157M
    switch (kind) {
2285
21.7M
    case PyUnicode_1BYTE_KIND:
2286
21.7M
        return _PyUnicode_FromUCS1(buffer, size);
2287
83.5M
    case PyUnicode_2BYTE_KIND:
2288
83.5M
        return _PyUnicode_FromUCS2(buffer, size);
2289
52.3M
    case PyUnicode_4BYTE_KIND:
2290
52.3M
        return _PyUnicode_FromUCS4(buffer, size);
2291
0
    default:
2292
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2293
0
        return NULL;
2294
157M
    }
2295
157M
}
2296
2297
Py_UCS4
2298
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2299
13.1M
{
2300
13.1M
    int kind;
2301
13.1M
    const void *startptr, *endptr;
2302
2303
13.1M
    assert(0 <= start);
2304
13.1M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2305
13.1M
    assert(start <= end);
2306
2307
13.1M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2308
85.1k
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2309
2310
13.0M
    if (start == end)
2311
0
        return 127;
2312
2313
13.0M
    if (PyUnicode_IS_ASCII(unicode))
2314
12.9M
        return 127;
2315
2316
31.2k
    kind = PyUnicode_KIND(unicode);
2317
31.2k
    startptr = PyUnicode_DATA(unicode);
2318
31.2k
    endptr = (char *)startptr + end * kind;
2319
31.2k
    startptr = (char *)startptr + start * kind;
2320
31.2k
    switch(kind) {
2321
1.68k
    case PyUnicode_1BYTE_KIND:
2322
1.68k
        return ucs1lib_find_max_char(startptr, endptr);
2323
8.97k
    case PyUnicode_2BYTE_KIND:
2324
8.97k
        return ucs2lib_find_max_char(startptr, endptr);
2325
20.5k
    case PyUnicode_4BYTE_KIND:
2326
20.5k
        return ucs4lib_find_max_char(startptr, endptr);
2327
0
    default:
2328
0
        Py_UNREACHABLE();
2329
31.2k
    }
2330
31.2k
}
2331
2332
/* Ensure that a string uses the most efficient storage, if it is not the
2333
   case: create a new string of the right kind. Write NULL into *p_unicode
2334
   on error. */
2335
static void
2336
unicode_adjust_maxchar(PyObject **p_unicode)
2337
0
{
2338
0
    PyObject *unicode, *copy;
2339
0
    Py_UCS4 max_char;
2340
0
    Py_ssize_t len;
2341
0
    int kind;
2342
2343
0
    assert(p_unicode != NULL);
2344
0
    unicode = *p_unicode;
2345
0
    if (PyUnicode_IS_ASCII(unicode))
2346
0
        return;
2347
2348
0
    len = PyUnicode_GET_LENGTH(unicode);
2349
0
    kind = PyUnicode_KIND(unicode);
2350
0
    if (kind == PyUnicode_1BYTE_KIND) {
2351
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2352
0
        max_char = ucs1lib_find_max_char(u, u + len);
2353
0
        if (max_char >= 128)
2354
0
            return;
2355
0
    }
2356
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2357
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2358
0
        max_char = ucs2lib_find_max_char(u, u + len);
2359
0
        if (max_char >= 256)
2360
0
            return;
2361
0
    }
2362
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2363
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2364
0
        max_char = ucs4lib_find_max_char(u, u + len);
2365
0
        if (max_char >= 0x10000)
2366
0
            return;
2367
0
    }
2368
0
    else
2369
0
        Py_UNREACHABLE();
2370
2371
0
    copy = PyUnicode_New(len, max_char);
2372
0
    if (copy != NULL)
2373
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2374
0
    Py_DECREF(unicode);
2375
0
    *p_unicode = copy;
2376
0
}
2377
2378
/* Return a new string with the same length, max char value and character
   data as `unicode`.  Raises via PyErr_BadInternalCall() when `unicode` is
   not a str; returns NULL on error. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *duplicate = PyUnicode_New(nchars,
                                        PyUnicode_MAX_CHAR_VALUE(unicode));
    if (duplicate == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(duplicate) == PyUnicode_KIND(unicode));

    /* Same kind on both sides, so a raw byte copy is sufficient. */
    memcpy(PyUnicode_DATA(duplicate), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(duplicate, 1));
    return duplicate;
}
2400
2401
2402
/* Widen Unicode objects to larger buffers. Don't write terminating null
2403
   character. Return NULL on error. */
2404
2405
static void*
2406
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2407
7.46M
{
2408
7.46M
    void *result;
2409
2410
7.46M
    assert(skind < kind);
2411
7.46M
    switch (kind) {
2412
4.83M
    case PyUnicode_2BYTE_KIND:
2413
4.83M
        result = PyMem_New(Py_UCS2, len);
2414
4.83M
        if (!result)
2415
0
            return PyErr_NoMemory();
2416
4.83M
        assert(skind == PyUnicode_1BYTE_KIND);
2417
4.83M
        _PyUnicode_CONVERT_BYTES(
2418
4.83M
            Py_UCS1, Py_UCS2,
2419
4.83M
            (const Py_UCS1 *)data,
2420
4.83M
            ((const Py_UCS1 *)data) + len,
2421
4.83M
            result);
2422
4.83M
        return result;
2423
2.62M
    case PyUnicode_4BYTE_KIND:
2424
2.62M
        result = PyMem_New(Py_UCS4, len);
2425
2.62M
        if (!result)
2426
0
            return PyErr_NoMemory();
2427
2.62M
        if (skind == PyUnicode_2BYTE_KIND) {
2428
0
            _PyUnicode_CONVERT_BYTES(
2429
0
                Py_UCS2, Py_UCS4,
2430
0
                (const Py_UCS2 *)data,
2431
0
                ((const Py_UCS2 *)data) + len,
2432
0
                result);
2433
0
        }
2434
2.62M
        else {
2435
2.62M
            assert(skind == PyUnicode_1BYTE_KIND);
2436
2.62M
            _PyUnicode_CONVERT_BYTES(
2437
2.62M
                Py_UCS1, Py_UCS4,
2438
2.62M
                (const Py_UCS1 *)data,
2439
2.62M
                ((const Py_UCS1 *)data) + len,
2440
2.62M
                result);
2441
2.62M
        }
2442
2.62M
        return result;
2443
0
    default:
2444
0
        Py_UNREACHABLE();
2445
0
        return NULL;
2446
7.46M
    }
2447
7.46M
}
2448
2449
static Py_UCS4*
2450
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2451
        int copy_null)
2452
70.4k
{
2453
70.4k
    int kind;
2454
70.4k
    const void *data;
2455
70.4k
    Py_ssize_t len, targetlen;
2456
70.4k
    kind = PyUnicode_KIND(string);
2457
70.4k
    data = PyUnicode_DATA(string);
2458
70.4k
    len = PyUnicode_GET_LENGTH(string);
2459
70.4k
    targetlen = len;
2460
70.4k
    if (copy_null)
2461
0
        targetlen++;
2462
70.4k
    if (!target) {
2463
0
        target = PyMem_New(Py_UCS4, targetlen);
2464
0
        if (!target) {
2465
0
            PyErr_NoMemory();
2466
0
            return NULL;
2467
0
        }
2468
0
    }
2469
70.4k
    else {
2470
70.4k
        if (targetsize < targetlen) {
2471
0
            PyErr_Format(PyExc_SystemError,
2472
0
                         "string is longer than the buffer");
2473
0
            if (copy_null && 0 < targetsize)
2474
0
                target[0] = 0;
2475
0
            return NULL;
2476
0
        }
2477
70.4k
    }
2478
70.4k
    if (kind == PyUnicode_1BYTE_KIND) {
2479
52.4k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2480
52.4k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2481
52.4k
    }
2482
18.0k
    else if (kind == PyUnicode_2BYTE_KIND) {
2483
13.6k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2484
13.6k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2485
13.6k
    }
2486
4.38k
    else if (kind == PyUnicode_4BYTE_KIND) {
2487
4.38k
        memcpy(target, data, len * sizeof(Py_UCS4));
2488
4.38k
    }
2489
0
    else {
2490
0
        Py_UNREACHABLE();
2491
0
    }
2492
70.4k
    if (copy_null)
2493
0
        target[len] = 0;
2494
70.4k
    return target;
2495
70.4k
}
2496
2497
Py_UCS4*
2498
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2499
                 int copy_null)
2500
70.4k
{
2501
70.4k
    if (target == NULL || targetsize < 0) {
2502
0
        PyErr_BadInternalCall();
2503
0
        return NULL;
2504
0
    }
2505
70.4k
    return as_ucs4(string, target, targetsize, copy_null);
2506
70.4k
}
2507
2508
Py_UCS4*
2509
PyUnicode_AsUCS4Copy(PyObject *string)
2510
0
{
2511
0
    return as_ucs4(string, NULL, 0, 1);
2512
0
}
2513
2514
/* maximum number of characters required for output of %jo or %jd or %p.
2515
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
2516
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
2517
   plus 1 for the terminal NUL. */
2518
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2519
2520
static int
2521
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2522
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2523
8.85M
{
2524
8.85M
    Py_ssize_t length, fill, arglen;
2525
8.85M
    Py_UCS4 maxchar;
2526
2527
8.85M
    length = PyUnicode_GET_LENGTH(str);
2528
8.85M
    if ((precision == -1 || precision >= length)
2529
8.85M
        && width <= length)
2530
8.85M
        return _PyUnicodeWriter_WriteStr(writer, str);
2531
2532
47
    if (precision != -1)
2533
47
        length = Py_MIN(precision, length);
2534
2535
47
    arglen = Py_MAX(length, width);
2536
47
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2537
19
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2538
28
    else
2539
28
        maxchar = writer->maxchar;
2540
2541
47
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2542
0
        return -1;
2543
2544
47
    fill = Py_MAX(width - length, 0);
2545
47
    if (fill && !(flags & F_LJUST)) {
2546
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2547
0
            return -1;
2548
0
        writer->pos += fill;
2549
0
    }
2550
2551
47
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2552
47
                                  str, 0, length);
2553
47
    writer->pos += length;
2554
2555
47
    if (fill && (flags & F_LJUST)) {
2556
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2557
0
            return -1;
2558
0
        writer->pos += fill;
2559
0
    }
2560
2561
47
    return 0;
2562
47
}
2563
2564
static int
2565
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2566
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2567
3.88M
{
2568
    /* UTF-8 */
2569
3.88M
    Py_ssize_t *pconsumed = NULL;
2570
3.88M
    Py_ssize_t length;
2571
3.88M
    if (precision == -1) {
2572
835k
        length = strlen(str);
2573
835k
    }
2574
3.04M
    else {
2575
3.04M
        length = 0;
2576
48.2M
        while (length < precision && str[length]) {
2577
45.2M
            length++;
2578
45.2M
        }
2579
3.04M
        if (length == precision) {
2580
            /* The input string is not NUL-terminated.  If it ends with an
2581
             * incomplete UTF-8 sequence, truncate the string just before it.
2582
             * Incomplete sequences in the middle and sequences which cannot
2583
             * be valid prefixes are still treated as errors and replaced
2584
             * with \xfffd. */
2585
2.25k
            pconsumed = &length;
2586
2.25k
        }
2587
3.04M
    }
2588
2589
3.88M
    if (width < 0) {
2590
3.88M
        return _PyUnicode_DecodeUTF8Writer(writer, str, length,
2591
3.88M
                                           _Py_ERROR_REPLACE, "replace", pconsumed);
2592
3.88M
    }
2593
2594
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2595
0
                                                     "replace", pconsumed);
2596
0
    if (unicode == NULL)
2597
0
        return -1;
2598
2599
0
    int res = unicode_fromformat_write_str(writer, unicode,
2600
0
                                           width, -1, flags);
2601
0
    Py_DECREF(unicode);
2602
0
    return res;
2603
0
}
2604
2605
static int
2606
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2607
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2608
0
{
2609
0
    Py_ssize_t length;
2610
0
    if (precision == -1) {
2611
0
        length = wcslen(str);
2612
0
    }
2613
0
    else {
2614
0
        length = 0;
2615
0
        while (length < precision && str[length]) {
2616
0
            length++;
2617
0
        }
2618
0
    }
2619
2620
0
    if (width < 0) {
2621
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2622
0
                                             str, length);
2623
0
    }
2624
2625
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2626
0
    if (unicode == NULL)
2627
0
        return -1;
2628
2629
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2630
0
    Py_DECREF(unicode);
2631
0
    return res;
2632
0
}
2633
2634
0
#define F_LONG 1
2635
0
#define F_LONGLONG 2
2636
230k
#define F_SIZE 3
2637
0
#define F_PTRDIFF 4
2638
0
#define F_INTMAX 5
2639
2640
static const char*
2641
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2642
                       const char *f, va_list *vargs)
2643
28.1M
{
2644
28.1M
    const char *p;
2645
28.1M
    Py_ssize_t len;
2646
28.1M
    int flags = 0;
2647
28.1M
    Py_ssize_t width;
2648
28.1M
    Py_ssize_t precision;
2649
2650
28.1M
    p = f;
2651
28.1M
    f++;
2652
28.1M
    if (*f == '%') {
2653
1.02M
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2654
0
            return NULL;
2655
1.02M
        f++;
2656
1.02M
        return f;
2657
1.02M
    }
2658
2659
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2660
    /* Flags '+', ' ' and '#' are not particularly useful.
2661
     * They are not worth the implementation and maintenance costs.
2662
     * In addition, '#' should add "0" for "o" conversions for compatibility
2663
     * with printf, but it would confuse Python users. */
2664
27.0M
    while (1) {
2665
27.0M
        switch (*f++) {
2666
0
        case '-': flags |= F_LJUST; continue;
2667
1.70k
        case '0': flags |= F_ZERO; continue;
2668
0
        case '#': flags |= F_ALT; continue;
2669
27.0M
        }
2670
27.0M
        f--;
2671
27.0M
        break;
2672
27.0M
    }
2673
2674
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2675
27.0M
    width = -1;
2676
27.0M
    if (*f == '*') {
2677
0
        width = va_arg(*vargs, int);
2678
0
        if (width < 0) {
2679
0
            flags |= F_LJUST;
2680
0
            width = -width;
2681
0
        }
2682
0
        f++;
2683
0
    }
2684
27.0M
    else if (Py_ISDIGIT((unsigned)*f)) {
2685
1.70k
        width = *f - '0';
2686
1.70k
        f++;
2687
1.70k
        while (Py_ISDIGIT((unsigned)*f)) {
2688
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2689
0
                PyErr_SetString(PyExc_ValueError,
2690
0
                                "width too big");
2691
0
                return NULL;
2692
0
            }
2693
0
            width = (width * 10) + (*f - '0');
2694
0
            f++;
2695
0
        }
2696
1.70k
    }
2697
27.0M
    precision = -1;
2698
27.0M
    if (*f == '.') {
2699
6.13M
        f++;
2700
6.13M
        if (*f == '*') {
2701
0
            precision = va_arg(*vargs, int);
2702
0
            if (precision < 0) {
2703
0
                precision = -2;
2704
0
            }
2705
0
            f++;
2706
0
        }
2707
6.13M
        else if (Py_ISDIGIT((unsigned)*f)) {
2708
6.13M
            precision = (*f - '0');
2709
6.13M
            f++;
2710
18.3M
            while (Py_ISDIGIT((unsigned)*f)) {
2711
12.2M
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2712
0
                    PyErr_SetString(PyExc_ValueError,
2713
0
                                    "precision too big");
2714
0
                    return NULL;
2715
0
                }
2716
12.2M
                precision = (precision * 10) + (*f - '0');
2717
12.2M
                f++;
2718
12.2M
            }
2719
6.13M
        }
2720
6.13M
    }
2721
2722
27.0M
    int sizemod = 0;
2723
27.0M
    if (*f == 'l') {
2724
0
        if (f[1] == 'l') {
2725
0
            sizemod = F_LONGLONG;
2726
0
            f += 2;
2727
0
        }
2728
0
        else {
2729
0
            sizemod = F_LONG;
2730
0
            ++f;
2731
0
        }
2732
0
    }
2733
27.0M
    else if (*f == 'z') {
2734
115k
        sizemod = F_SIZE;
2735
115k
        ++f;
2736
115k
    }
2737
26.9M
    else if (*f == 't') {
2738
0
        sizemod = F_PTRDIFF;
2739
0
        ++f;
2740
0
    }
2741
26.9M
    else if (*f == 'j') {
2742
0
        sizemod = F_INTMAX;
2743
0
        ++f;
2744
0
    }
2745
27.0M
    if (f[0] != '\0' && f[1] == '\0')
2746
4.78M
        writer->overallocate = 0;
2747
2748
27.0M
    switch (*f) {
2749
12.7M
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2750
12.7M
        break;
2751
1.60M
    case 'c': case 'p':
2752
1.60M
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2753
1.60M
        break;
2754
3.88M
    case 's':
2755
3.88M
    case 'V':
2756
3.88M
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2757
3.88M
        break;
2758
8.85M
    default:
2759
8.85M
        if (sizemod) goto invalid_format;
2760
8.85M
        break;
2761
27.0M
    }
2762
2763
27.0M
    switch (*f) {
2764
1.59M
    case 'c':
2765
1.59M
    {
2766
1.59M
        int ordinal = va_arg(*vargs, int);
2767
1.59M
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2768
0
            PyErr_SetString(PyExc_OverflowError,
2769
0
                            "character argument not in range(0x110000)");
2770
0
            return NULL;
2771
0
        }
2772
1.59M
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2773
0
            return NULL;
2774
1.59M
        break;
2775
1.59M
    }
2776
2777
12.7M
    case 'd': case 'i':
2778
12.7M
    case 'o': case 'u': case 'x': case 'X':
2779
12.7M
    {
2780
12.7M
        char buffer[MAX_INTMAX_CHARS];
2781
2782
        // Fill buffer using sprinf, with one of many possible format
2783
        // strings, like "%llX" for `long long` in hexadecimal.
2784
        // The type/size is in `sizemod`; the format is in `*f`.
2785
2786
        // Use macros with nested switches to keep the sprintf format strings
2787
        // as compile-time literals, avoiding warnings and maybe allowing
2788
        // optimizations.
2789
2790
        // `SPRINT` macro does one sprintf
2791
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2792
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2793
12.7M
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2794
12.7M
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2795
2796
        // One inner switch to handle all format variants
2797
12.7M
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2798
12.7M
            switch (*f) {                                                     \
2799
96
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2800
19.2k
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2801
1.32k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2802
976
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2803
12.7M
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2804
12.7M
            }
2805
2806
        // Outer switch to handle all the sizes/types
2807
12.7M
        switch (sizemod) {
2808
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2809
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2810
115k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2811
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2812
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2813
12.6M
            default:         DO_SPRINTS("", int, unsigned int); break;
2814
12.7M
        }
2815
12.7M
        #undef SPRINT
2816
12.7M
        #undef DO_SPRINTS
2817
2818
12.7M
        assert(len >= 0);
2819
2820
12.7M
        int sign = (buffer[0] == '-');
2821
12.7M
        len -= sign;
2822
2823
12.7M
        precision = Py_MAX(precision, len);
2824
12.7M
        width = Py_MAX(width, precision + sign);
2825
12.7M
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2826
1.70k
            precision = width - sign;
2827
1.70k
        }
2828
2829
12.7M
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2830
12.7M
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2831
2832
12.7M
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2833
0
            return NULL;
2834
2835
12.7M
        if (spacepad && !(flags & F_LJUST)) {
2836
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2837
0
                return NULL;
2838
0
            writer->pos += spacepad;
2839
0
        }
2840
2841
12.7M
        if (sign) {
2842
863
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2843
0
                return NULL;
2844
863
        }
2845
2846
12.7M
        if (zeropad) {
2847
652
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2848
0
                return NULL;
2849
652
            writer->pos += zeropad;
2850
652
        }
2851
2852
12.7M
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2853
0
            return NULL;
2854
2855
12.7M
        if (spacepad && (flags & F_LJUST)) {
2856
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2857
0
                return NULL;
2858
0
            writer->pos += spacepad;
2859
0
        }
2860
12.7M
        break;
2861
12.7M
    }
2862
2863
12.7M
    case 'p':
2864
2.92k
    {
2865
2.92k
        char number[MAX_INTMAX_CHARS];
2866
2867
2.92k
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2868
2.92k
        assert(len >= 0);
2869
2870
        /* %p is ill-defined:  ensure leading 0x. */
2871
2.92k
        if (number[1] == 'X')
2872
0
            number[1] = 'x';
2873
2.92k
        else if (number[1] != 'x') {
2874
0
            memmove(number + 2, number,
2875
0
                    strlen(number) + 1);
2876
0
            number[0] = '0';
2877
0
            number[1] = 'x';
2878
0
            len += 2;
2879
0
        }
2880
2881
2.92k
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2882
0
            return NULL;
2883
2.92k
        break;
2884
2.92k
    }
2885
2886
3.88M
    case 's':
2887
3.88M
    {
2888
3.88M
        if (sizemod) {
2889
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
2890
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
2891
0
                return NULL;
2892
0
        }
2893
3.88M
        else {
2894
            /* UTF-8 */
2895
3.88M
            const char *s = va_arg(*vargs, const char*);
2896
3.88M
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
2897
0
                return NULL;
2898
3.88M
        }
2899
3.88M
        break;
2900
3.88M
    }
2901
2902
4.17M
    case 'U':
2903
4.17M
    {
2904
4.17M
        PyObject *obj = va_arg(*vargs, PyObject *);
2905
4.17M
        assert(obj && _PyUnicode_CHECK(obj));
2906
2907
4.17M
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2908
0
            return NULL;
2909
4.17M
        break;
2910
4.17M
    }
2911
2912
4.17M
    case 'V':
2913
596
    {
2914
596
        PyObject *obj = va_arg(*vargs, PyObject *);
2915
596
        const char *str;
2916
596
        const wchar_t *wstr;
2917
596
        if (sizemod) {
2918
0
            wstr = va_arg(*vargs, const wchar_t*);
2919
0
        }
2920
596
        else {
2921
596
            str = va_arg(*vargs, const char *);
2922
596
        }
2923
596
        if (obj) {
2924
0
            assert(_PyUnicode_CHECK(obj));
2925
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2926
0
                return NULL;
2927
0
        }
2928
596
        else if (sizemod) {
2929
0
            assert(wstr != NULL);
2930
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
2931
0
                return NULL;
2932
0
        }
2933
596
        else {
2934
596
            assert(str != NULL);
2935
596
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
2936
0
                return NULL;
2937
596
        }
2938
596
        break;
2939
596
    }
2940
2941
1.68k
    case 'S':
2942
1.68k
    {
2943
1.68k
        PyObject *obj = va_arg(*vargs, PyObject *);
2944
1.68k
        PyObject *str;
2945
1.68k
        assert(obj);
2946
1.68k
        str = PyObject_Str(obj);
2947
1.68k
        if (!str)
2948
0
            return NULL;
2949
1.68k
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
2950
0
            Py_DECREF(str);
2951
0
            return NULL;
2952
0
        }
2953
1.68k
        Py_DECREF(str);
2954
1.68k
        break;
2955
1.68k
    }
2956
2957
3.08M
    case 'R':
2958
3.08M
    {
2959
3.08M
        PyObject *obj = va_arg(*vargs, PyObject *);
2960
3.08M
        PyObject *repr;
2961
3.08M
        assert(obj);
2962
3.08M
        repr = PyObject_Repr(obj);
2963
3.08M
        if (!repr)
2964
0
            return NULL;
2965
3.08M
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
2966
0
            Py_DECREF(repr);
2967
0
            return NULL;
2968
0
        }
2969
3.08M
        Py_DECREF(repr);
2970
3.08M
        break;
2971
3.08M
    }
2972
2973
0
    case 'A':
2974
0
    {
2975
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2976
0
        PyObject *ascii;
2977
0
        assert(obj);
2978
0
        ascii = PyObject_ASCII(obj);
2979
0
        if (!ascii)
2980
0
            return NULL;
2981
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
2982
0
            Py_DECREF(ascii);
2983
0
            return NULL;
2984
0
        }
2985
0
        Py_DECREF(ascii);
2986
0
        break;
2987
0
    }
2988
2989
1.59M
    case 'T':
2990
1.59M
    {
2991
1.59M
        PyObject *obj = va_arg(*vargs, PyObject *);
2992
1.59M
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
2993
2994
1.59M
        PyObject *type_name;
2995
1.59M
        if (flags & F_ALT) {
2996
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
2997
0
        }
2998
1.59M
        else {
2999
1.59M
            type_name = PyType_GetFullyQualifiedName(type);
3000
1.59M
        }
3001
1.59M
        Py_DECREF(type);
3002
1.59M
        if (!type_name) {
3003
0
            return NULL;
3004
0
        }
3005
3006
1.59M
        if (unicode_fromformat_write_str(writer, type_name,
3007
1.59M
                                         width, precision, flags) == -1) {
3008
0
            Py_DECREF(type_name);
3009
0
            return NULL;
3010
0
        }
3011
1.59M
        Py_DECREF(type_name);
3012
1.59M
        break;
3013
1.59M
    }
3014
3015
0
    case 'N':
3016
0
    {
3017
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3018
0
        assert(type_raw != NULL);
3019
3020
0
        if (!PyType_Check(type_raw)) {
3021
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3022
0
            return NULL;
3023
0
        }
3024
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3025
3026
0
        PyObject *type_name;
3027
0
        if (flags & F_ALT) {
3028
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3029
0
        }
3030
0
        else {
3031
0
            type_name = PyType_GetFullyQualifiedName(type);
3032
0
        }
3033
0
        if (!type_name) {
3034
0
            return NULL;
3035
0
        }
3036
0
        if (unicode_fromformat_write_str(writer, type_name,
3037
0
                                         width, precision, flags) == -1) {
3038
0
            Py_DECREF(type_name);
3039
0
            return NULL;
3040
0
        }
3041
0
        Py_DECREF(type_name);
3042
0
        break;
3043
0
    }
3044
3045
0
    default:
3046
0
    invalid_format:
3047
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3048
0
        return NULL;
3049
27.0M
    }
3050
3051
27.0M
    f++;
3052
27.0M
    return f;
3053
27.0M
}
3054
3055
static int
3056
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3057
13.3M
{
3058
13.3M
    Py_ssize_t len = strlen(format);
3059
13.3M
    writer->min_length += len + 100;
3060
13.3M
    writer->overallocate = 1;
3061
3062
    // Copy varags to be able to pass a reference to a subfunction.
3063
13.3M
    va_list vargs2;
3064
13.3M
    va_copy(vargs2, vargs);
3065
3066
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3067
    // to be encoded to ASCII.
3068
13.3M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3069
13.3M
    if (!is_ascii) {
3070
0
        Py_ssize_t i;
3071
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3072
0
        PyErr_Format(PyExc_ValueError,
3073
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3074
0
            "string, got a non-ASCII byte: 0x%02x",
3075
0
            (unsigned char)format[i]);
3076
0
        goto fail;
3077
0
    }
3078
3079
75.3M
    for (const char *f = format; *f; ) {
3080
61.9M
        if (*f == '%') {
3081
28.1M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3082
28.1M
            if (f == NULL)
3083
0
                goto fail;
3084
28.1M
        }
3085
33.8M
        else {
3086
33.8M
            const char *p = strchr(f, '%');
3087
33.8M
            if (p != NULL) {
3088
25.2M
                len = p - f;
3089
25.2M
            }
3090
8.61M
            else {
3091
8.61M
                len = strlen(f);
3092
8.61M
                writer->overallocate = 0;
3093
8.61M
            }
3094
3095
33.8M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3096
0
                goto fail;
3097
0
            }
3098
33.8M
            f += len;
3099
33.8M
        }
3100
61.9M
    }
3101
13.3M
    va_end(vargs2);
3102
13.3M
    return 0;
3103
3104
0
  fail:
3105
0
    va_end(vargs2);
3106
0
    return -1;
3107
13.3M
}
3108
3109
PyObject *
3110
PyUnicode_FromFormatV(const char *format, va_list vargs)
3111
13.3M
{
3112
13.3M
    _PyUnicodeWriter writer;
3113
13.3M
    _PyUnicodeWriter_Init(&writer);
3114
3115
13.3M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3116
0
        _PyUnicodeWriter_Dealloc(&writer);
3117
0
        return NULL;
3118
0
    }
3119
13.3M
    return _PyUnicodeWriter_Finish(&writer);
3120
13.3M
}
3121
3122
/* Variadic front end of PyUnicode_FromFormatV(). */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list vargs;

    va_start(vargs, format);
    PyObject *result = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);

    return result;
}
3133
3134
int
3135
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3136
0
{
3137
0
    va_list vargs;
3138
0
    va_start(vargs, format);
3139
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3140
0
    va_end(vargs);
3141
0
    return res;
3142
0
}
3143
3144
int
3145
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3146
                         va_list vargs)
3147
0
{
3148
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3149
0
    Py_ssize_t old_pos = _writer->pos;
3150
3151
0
    int res = unicode_from_format(_writer, format, vargs);
3152
3153
0
    if (res < 0) {
3154
0
        _writer->pos = old_pos;
3155
0
    }
3156
0
    return res;
3157
0
}
3158
3159
static Py_ssize_t
3160
unicode_get_widechar_size(PyObject *unicode)
3161
195k
{
3162
195k
    Py_ssize_t res;
3163
3164
195k
    assert(unicode != NULL);
3165
195k
    assert(_PyUnicode_CHECK(unicode));
3166
3167
195k
    res = _PyUnicode_LENGTH(unicode);
3168
#if SIZEOF_WCHAR_T == 2
3169
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3170
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3171
        const Py_UCS4 *end = s + res;
3172
        for (; s < end; ++s) {
3173
            if (*s > 0xFFFF) {
3174
                ++res;
3175
            }
3176
        }
3177
    }
3178
#endif
3179
195k
    return res;
3180
195k
}
3181
3182
static void
3183
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3184
195k
{
3185
195k
    assert(unicode != NULL);
3186
195k
    assert(_PyUnicode_CHECK(unicode));
3187
3188
195k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3189
1.25k
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3190
1.25k
        return;
3191
1.25k
    }
3192
3193
194k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3194
162k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3195
4.47M
        for (; size--; ++s, ++w) {
3196
4.30M
            *w = *s;
3197
4.30M
        }
3198
162k
    }
3199
31.9k
    else {
3200
31.9k
#if SIZEOF_WCHAR_T == 4
3201
31.9k
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3202
31.9k
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3203
1.95M
        for (; size--; ++s, ++w) {
3204
1.92M
            *w = *s;
3205
1.92M
        }
3206
#else
3207
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3208
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3209
        for (; size--; ++s, ++w) {
3210
            Py_UCS4 ch = *s;
3211
            if (ch > 0xFFFF) {
3212
                assert(ch <= MAX_UNICODE);
3213
                /* encode surrogate pair in this case */
3214
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3215
                if (!size--)
3216
                    break;
3217
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3218
            }
3219
            else {
3220
                *w = ch;
3221
            }
3222
        }
3223
#endif
3224
31.9k
    }
3225
194k
}
3226
3227
#ifdef HAVE_WCHAR_H
3228
3229
/* Convert a Unicode object to a wide character string.
3230
3231
   - If w is NULL: return the number of wide characters (including the null
3232
     character) required to convert the unicode object. Ignore size argument.
3233
3234
   - Otherwise: return the number of wide characters (excluding the null
3235
     character) written into w. Write at most size wide characters (including
3236
     the null character). */
3237
Py_ssize_t
3238
PyUnicode_AsWideChar(PyObject *unicode,
3239
                     wchar_t *w,
3240
                     Py_ssize_t size)
3241
1.42k
{
3242
1.42k
    Py_ssize_t res;
3243
3244
1.42k
    if (unicode == NULL) {
3245
0
        PyErr_BadInternalCall();
3246
0
        return -1;
3247
0
    }
3248
1.42k
    if (!PyUnicode_Check(unicode)) {
3249
0
        PyErr_BadArgument();
3250
0
        return -1;
3251
0
    }
3252
3253
1.42k
    res = unicode_get_widechar_size(unicode);
3254
1.42k
    if (w == NULL) {
3255
0
        return res + 1;
3256
0
    }
3257
3258
1.42k
    if (size > res) {
3259
1.42k
        size = res + 1;
3260
1.42k
    }
3261
0
    else {
3262
0
        res = size;
3263
0
    }
3264
1.42k
    unicode_copy_as_widechar(unicode, w, size);
3265
3266
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3267
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3268
       non-Unicode locales and hence needs conversion first. */
3269
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3270
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3271
            return -1;
3272
        }
3273
    }
3274
#endif
3275
3276
1.42k
    return res;
3277
1.42k
}
3278
3279
wchar_t*
3280
PyUnicode_AsWideCharString(PyObject *unicode,
3281
                           Py_ssize_t *size)
3282
194k
{
3283
194k
    wchar_t *buffer;
3284
194k
    Py_ssize_t buflen;
3285
3286
194k
    if (unicode == NULL) {
3287
0
        PyErr_BadInternalCall();
3288
0
        return NULL;
3289
0
    }
3290
194k
    if (!PyUnicode_Check(unicode)) {
3291
0
        PyErr_BadArgument();
3292
0
        return NULL;
3293
0
    }
3294
3295
194k
    buflen = unicode_get_widechar_size(unicode);
3296
194k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3297
194k
    if (buffer == NULL) {
3298
0
        PyErr_NoMemory();
3299
0
        return NULL;
3300
0
    }
3301
194k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3302
3303
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3304
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3305
       non-Unicode locales and hence needs conversion first. */
3306
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3307
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3308
            return NULL;
3309
        }
3310
    }
3311
#endif
3312
3313
194k
    if (size != NULL) {
3314
193k
        *size = buflen;
3315
193k
    }
3316
1.08k
    else if (wcslen(buffer) != (size_t)buflen) {
3317
0
        PyMem_Free(buffer);
3318
0
        PyErr_SetString(PyExc_ValueError,
3319
0
                        "embedded null character");
3320
0
        return NULL;
3321
0
    }
3322
194k
    return buffer;
3323
194k
}
3324
3325
#endif /* HAVE_WCHAR_H */
3326
3327
int
3328
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3329
0
{
3330
0
    wchar_t **p = (wchar_t **)ptr;
3331
0
    if (obj == NULL) {
3332
0
        PyMem_Free(*p);
3333
0
        *p = NULL;
3334
0
        return 1;
3335
0
    }
3336
0
    if (PyUnicode_Check(obj)) {
3337
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3338
0
        if (*p == NULL) {
3339
0
            return 0;
3340
0
        }
3341
0
        return Py_CLEANUP_SUPPORTED;
3342
0
    }
3343
0
    PyErr_Format(PyExc_TypeError,
3344
0
                 "argument must be str, not %.50s",
3345
0
                 Py_TYPE(obj)->tp_name);
3346
0
    return 0;
3347
0
}
3348
3349
int
3350
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3351
0
{
3352
0
    wchar_t **p = (wchar_t **)ptr;
3353
0
    if (obj == NULL) {
3354
0
        PyMem_Free(*p);
3355
0
        *p = NULL;
3356
0
        return 1;
3357
0
    }
3358
0
    if (obj == Py_None) {
3359
0
        *p = NULL;
3360
0
        return 1;
3361
0
    }
3362
0
    if (PyUnicode_Check(obj)) {
3363
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3364
0
        if (*p == NULL) {
3365
0
            return 0;
3366
0
        }
3367
0
        return Py_CLEANUP_SUPPORTED;
3368
0
    }
3369
0
    PyErr_Format(PyExc_TypeError,
3370
0
                 "argument must be str or None, not %.50s",
3371
0
                 Py_TYPE(obj)->tp_name);
3372
0
    return 0;
3373
0
}
3374
3375
PyObject *
3376
PyUnicode_FromOrdinal(int ordinal)
3377
5.80M
{
3378
5.80M
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3379
34
        PyErr_SetString(PyExc_ValueError,
3380
34
                        "chr() arg not in range(0x110000)");
3381
34
        return NULL;
3382
34
    }
3383
3384
5.80M
    return unicode_char((Py_UCS4)ordinal);
3385
5.80M
}
3386
3387
/* Return a str for `obj`:
   - an exact str is returned as a new reference to the same object;
   - an instance of a str subtype is copied with _PyUnicode_Copy();
   - any other object raises TypeError and NULL is returned. */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    /* XXX Perhaps we should make this API an alias of
       PyObject_Str() instead ?! */
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    /* For a Unicode subtype that's not a Unicode object,
       return a true Unicode object with the same data. */
    return _PyUnicode_Copy(obj);
}
3405
3406
/* Decode a bytes-like object into a str using PyUnicode_Decode().

   `obj` may be a bytes object (fast path) or any object that exports a
   simple buffer (PyBUF_SIMPLE).  A str argument is rejected with TypeError.
   Empty input yields the empty string after `encoding` and `errors` have
   been checked by unicode_check_encoding_errors().

   Return NULL with an exception set on error. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        const Py_ssize_t nbytes = PyBytes_GET_SIZE(obj);
        if (nbytes != 0) {
            return PyUnicode_Decode(PyBytes_AS_STRING(obj), nbytes,
                                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3458
3459
/* Normalize an encoding name like encodings.normalize_encoding(),
   optionally converting it to lowercase when *to_lower* is true.

   Alphanumeric characters and '.' are copied to *lower*; every run of other
   characters between two copied characters becomes a single '_', and such
   runs at the start or end of the name are dropped.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len,
                       int to_lower)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last slot of the buffer, reserved for the null terminator. */
    char *const out_end = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        const char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit '_' lazily. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = to_lower ? Py_TOLOWER(c) : c;
    }

    *out = '\0';
    return 1;
}
3508
3509
PyObject *
3510
PyUnicode_Decode(const char *s,
3511
                 Py_ssize_t size,
3512
                 const char *encoding,
3513
                 const char *errors)
3514
20.8M
{
3515
20.8M
    PyObject *buffer = NULL, *unicode;
3516
20.8M
    Py_buffer info;
3517
20.8M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3518
3519
20.8M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3520
0
        return NULL;
3521
0
    }
3522
3523
20.8M
    if (size == 0) {
3524
0
        _Py_RETURN_UNICODE_EMPTY();
3525
0
    }
3526
3527
20.8M
    if (encoding == NULL) {
3528
43.9k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3529
43.9k
    }
3530
3531
    /* Shortcuts for common default encodings */
3532
20.8M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3533
20.8M
        char *lower = buflower;
3534
3535
        /* Fast paths */
3536
20.8M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3537
3.42M
            lower += 3;
3538
3.42M
            if (*lower == '_') {
3539
                /* Match "utf8" and "utf_8" */
3540
3.42M
                lower++;
3541
3.42M
            }
3542
3543
3.42M
            if (lower[0] == '8' && lower[1] == 0) {
3544
3.42M
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3545
3.42M
            }
3546
1.18k
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3547
177
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3548
177
            }
3549
1.00k
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3550
163
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3551
163
            }
3552
3.42M
        }
3553
17.4M
        else {
3554
17.4M
            if (strcmp(lower, "ascii") == 0
3555
13.2M
                || strcmp(lower, "us_ascii") == 0) {
3556
13.2M
                return PyUnicode_DecodeASCII(s, size, errors);
3557
13.2M
            }
3558
    #ifdef MS_WINDOWS
3559
            else if (strcmp(lower, "mbcs") == 0) {
3560
                return PyUnicode_DecodeMBCS(s, size, errors);
3561
            }
3562
    #endif
3563
4.17M
            else if (strcmp(lower, "latin1") == 0
3564
4.17M
                     || strcmp(lower, "latin_1") == 0
3565
1.26M
                     || strcmp(lower, "iso_8859_1") == 0
3566
2.93M
                     || strcmp(lower, "iso8859_1") == 0) {
3567
2.93M
                return PyUnicode_DecodeLatin1(s, size, errors);
3568
2.93M
            }
3569
17.4M
        }
3570
20.8M
    }
3571
3572
    /* Decode via the codec registry */
3573
1.24M
    buffer = NULL;
3574
1.24M
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3575
0
        goto onError;
3576
1.24M
    buffer = PyMemoryView_FromBuffer(&info);
3577
1.24M
    if (buffer == NULL)
3578
0
        goto onError;
3579
1.24M
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3580
1.24M
    if (unicode == NULL)
3581
86.9k
        goto onError;
3582
1.15M
    if (!PyUnicode_Check(unicode)) {
3583
0
        PyErr_Format(PyExc_TypeError,
3584
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3585
0
                     "use codecs.decode() to decode to arbitrary types",
3586
0
                     encoding,
3587
0
                     Py_TYPE(unicode)->tp_name);
3588
0
        Py_DECREF(unicode);
3589
0
        goto onError;
3590
0
    }
3591
1.15M
    Py_DECREF(buffer);
3592
1.15M
    return unicode_result(unicode);
3593
3594
86.9k
  onError:
3595
86.9k
    Py_XDECREF(buffer);
3596
86.9k
    return NULL;
3597
1.15M
}
3598
3599
/* Run the str object `unicode` through the decoder registered for
   `encoding` (the default encoding when NULL) and return whatever object
   the codec produces.  A non-str argument raises via PyErr_BadArgument().
   Returns NULL on error. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    const char *codec_name;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    codec_name = (encoding != NULL) ? encoding
                                    : PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec_name, errors);
}
3615
3616
/* Decode the str object `unicode` through the codec registry and require
   that the decoder returns a str.

   `encoding` may be NULL, in which case PyUnicode_GetDefaultEncoding() is
   used.  `errors` is passed through to PyCodec_Decode().

   Returns the decoded str (passed through unicode_result()), or NULL with
   an exception set: PyErr_BadArgument() if `unicode` is not a str, the
   codec's own error if decoding fails, or TypeError if the decoder
   returned a non-str object. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL) {
        return NULL;
    }
    if (!PyUnicode_Check(v)) {
        /* Name the type the decoder actually returned (`v`).  Formatting
           the type of the input `unicode` here would always print 'str',
           since the input was checked to be a str above. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3649
3650
/* Run the str object `unicode` through the encoder registered for
   `encoding` (the default encoding when NULL) and return whatever object
   the codec produces, without checking its type.  A non-str argument
   raises via PyErr_BadArgument().  Returns NULL on error. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    const char *codec_name;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    codec_name = (encoding != NULL) ? encoding
                                    : PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry; NULL simply propagates. */
    return PyCodec_Encode(unicode, codec_name, errors);
}
3674
3675
3676
static PyObject *
3677
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3678
                      int current_locale)
3679
0
{
3680
0
    Py_ssize_t wlen;
3681
0
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3682
0
    if (wstr == NULL) {
3683
0
        return NULL;
3684
0
    }
3685
3686
0
    if ((size_t)wlen != wcslen(wstr)) {
3687
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3688
0
        PyMem_Free(wstr);
3689
0
        return NULL;
3690
0
    }
3691
3692
0
    char *str;
3693
0
    size_t error_pos;
3694
0
    const char *reason;
3695
0
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3696
0
                                 current_locale, error_handler);
3697
0
    PyMem_Free(wstr);
3698
3699
0
    if (res != 0) {
3700
0
        if (res == -2) {
3701
0
            PyObject *exc;
3702
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3703
0
                    "locale", unicode,
3704
0
                    (Py_ssize_t)error_pos,
3705
0
                    (Py_ssize_t)(error_pos+1),
3706
0
                    reason);
3707
0
            if (exc != NULL) {
3708
0
                PyCodec_StrictErrors(exc);
3709
0
                Py_DECREF(exc);
3710
0
            }
3711
0
        }
3712
0
        else if (res == -3) {
3713
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3714
0
        }
3715
0
        else {
3716
0
            PyErr_NoMemory();
3717
0
        }
3718
0
        return NULL;
3719
0
    }
3720
3721
0
    PyObject *bytes = PyBytes_FromString(str);
3722
0
    PyMem_RawFree(str);
3723
0
    return bytes;
3724
0
}
3725
3726
/* Encode `unicode` with unicode_encode_locale(), using the error handler
   named by `errors` and passing current_locale=1. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    return unicode_encode_locale(unicode, _Py_GetErrorHandler(errors), 1);
}
3732
3733
/* Encode `unicode` with the interpreter's filesystem codec.

   Selection order:
     1. the UTF-8 encoder, when the codec state is flagged as UTF-8;
     2. the named codec via PyUnicode_AsEncodedString(), when an encoding
        name is stored (compiled out under _Py_FORCE_UTF8_FS_ENCODING);
     3. otherwise a fallback driven by config->filesystem_errors that does
        not need the Python codec machinery.

   Returns a new bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }
#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *fs_errors = config->filesystem_errors;
    assert(fs_errors != NULL);
    _Py_error_handler handler = get_error_handler_wide(fs_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3766
3767
/* Encode the str object `unicode` to bytes using the codec `encoding` and
   the error handler name `errors`.

   A NULL encoding means UTF-8, and a handful of very common encoding names
   are dispatched straight to the built-in encoders.  Everything else goes
   through the codec registry: a bytes result is returned as is, a
   bytearray result is copied to bytes after a RuntimeWarning, and any
   other result type raises TypeError.

   Returns a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Holds the normalized encoding name; sized for the longest shortcut,
       "iso_8859_1", plus its terminating NUL (sizeof == 11). */
    char normalized[11];
    PyObject *encoded;
    PyObject *copy;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Fast paths: only attempted when the normalized name fits the buffer. */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized), 1)) {
        if (strncmp(normalized, "utf", 3) == 0) {
            const char *tail = normalized + 3;
            if (*tail == '_') {
                /* Accept both "utf8" and "utf_8" spellings. */
                tail++;
            }

            if (strcmp(tail, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(tail, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(tail, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else if (strcmp(normalized, "ascii") == 0
                 || strcmp(normalized, "us_ascii") == 0) {
            return _PyUnicode_AsASCIIString(unicode, errors);
        }
#ifdef MS_WINDOWS
        else if (strcmp(normalized, "mbcs") == 0) {
            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
        }
#endif
        else if (strcmp(normalized, "latin1") == 0
                 || strcmp(normalized, "latin_1") == 0
                 || strcmp(normalized, "iso_8859_1") == 0
                 || strcmp(normalized, "iso8859_1") == 0) {
            return _PyUnicode_AsLatin1String(unicode, errors);
        }
    }

    /* Slow path: encode via the codec registry. */
    encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal case: the codec produced bytes. */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    /* Anything that is neither bytes nor bytearray is rejected. */
    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* A bytearray is tolerated: warn, then convert it to bytes. */
    if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding)) {
        Py_DECREF(encoded);
        return NULL;
    }

    copy = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                     PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return copy;
}
3866
3867
/* Run the str object `unicode` through the encoder registered for
   `encoding` (the default encoding when NULL) and require that the encoder
   returns a str.

   Returns the encoder's result, or NULL with an exception set:
   PyErr_BadArgument() for a non-str argument, the codec's own error, or
   TypeError when the encoder returned a non-str object. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *encoded;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    encoded = PyCodec_Encode(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(encoded)) {
        return encoded;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
3900
3901
static PyObject*
3902
unicode_decode_locale(const char *str, Py_ssize_t len,
3903
                      _Py_error_handler errors, int current_locale)
3904
328k
{
3905
328k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3906
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3907
0
        return NULL;
3908
0
    }
3909
3910
328k
    wchar_t *wstr;
3911
328k
    size_t wlen;
3912
328k
    const char *reason;
3913
328k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3914
328k
                                 current_locale, errors);
3915
328k
    if (res != 0) {
3916
0
        if (res == -2) {
3917
0
            PyObject *exc;
3918
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3919
0
                                        "locale", str, len,
3920
0
                                        (Py_ssize_t)wlen,
3921
0
                                        (Py_ssize_t)(wlen + 1),
3922
0
                                        reason);
3923
0
            if (exc != NULL) {
3924
0
                PyCodec_StrictErrors(exc);
3925
0
                Py_DECREF(exc);
3926
0
            }
3927
0
        }
3928
0
        else if (res == -3) {
3929
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3930
0
        }
3931
0
        else {
3932
0
            PyErr_NoMemory();
3933
0
        }
3934
0
        return NULL;
3935
0
    }
3936
3937
328k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3938
328k
    PyMem_RawFree(wstr);
3939
328k
    return unicode;
3940
328k
}
3941
3942
PyObject*
3943
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3944
                              const char *errors)
3945
0
{
3946
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3947
0
    return unicode_decode_locale(str, len, error_handler, 1);
3948
0
}
3949
3950
PyObject*
3951
PyUnicode_DecodeLocale(const char *str, const char *errors)
3952
328k
{
3953
328k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3954
328k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3955
328k
    return unicode_decode_locale(str, size, error_handler, 1);
3956
328k
}
3957
3958
3959
PyObject*
3960
330
PyUnicode_DecodeFSDefault(const char *s) {
3961
330
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3962
330
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3963
330
}
3964
3965
PyObject*
3966
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3967
149k
{
3968
149k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3969
149k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3970
149k
    if (fs_codec->utf8) {
3971
149k
        return unicode_decode_utf8(s, size,
3972
149k
                                   fs_codec->error_handler,
3973
149k
                                   fs_codec->errors,
3974
149k
                                   NULL);
3975
149k
    }
3976
36
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3977
36
    else if (fs_codec->encoding) {
3978
0
        return PyUnicode_Decode(s, size,
3979
0
                                fs_codec->encoding,
3980
0
                                fs_codec->errors);
3981
0
    }
3982
36
#endif
3983
36
    else {
3984
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3985
           machinery is not ready and so cannot be used:
3986
           use mbstowcs() in this case. */
3987
36
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3988
36
        const wchar_t *filesystem_errors = config->filesystem_errors;
3989
36
        assert(filesystem_errors != NULL);
3990
36
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3991
36
        assert(errors != _Py_ERROR_UNKNOWN);
3992
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3993
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3994
#else
3995
36
        return unicode_decode_locale(s, size, errors, 0);
3996
36
#endif
3997
36
    }
3998
149k
}
3999
4000
4001
int
4002
PyUnicode_FSConverter(PyObject* arg, void* addr)
4003
235k
{
4004
235k
    PyObject *path = NULL;
4005
235k
    PyObject *output = NULL;
4006
235k
    Py_ssize_t size;
4007
235k
    const char *data;
4008
235k
    if (arg == NULL) {
4009
0
        Py_DECREF(*(PyObject**)addr);
4010
0
        *(PyObject**)addr = NULL;
4011
0
        return 1;
4012
0
    }
4013
235k
    path = PyOS_FSPath(arg);
4014
235k
    if (path == NULL) {
4015
0
        return 0;
4016
0
    }
4017
235k
    if (PyBytes_Check(path)) {
4018
0
        output = path;
4019
0
    }
4020
235k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4021
235k
        output = PyUnicode_EncodeFSDefault(path);
4022
235k
        Py_DECREF(path);
4023
235k
        if (!output) {
4024
0
            return 0;
4025
0
        }
4026
235k
        assert(PyBytes_Check(output));
4027
235k
    }
4028
4029
235k
    size = PyBytes_GET_SIZE(output);
4030
235k
    data = PyBytes_AS_STRING(output);
4031
235k
    if ((size_t)size != strlen(data)) {
4032
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4033
0
        Py_DECREF(output);
4034
0
        return 0;
4035
0
    }
4036
235k
    *(PyObject**)addr = output;
4037
235k
    return Py_CLEANUP_SUPPORTED;
4038
235k
}
4039
4040
4041
int
4042
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4043
109k
{
4044
109k
    if (arg == NULL) {
4045
0
        Py_DECREF(*(PyObject**)addr);
4046
0
        *(PyObject**)addr = NULL;
4047
0
        return 1;
4048
0
    }
4049
4050
109k
    PyObject *path = PyOS_FSPath(arg);
4051
109k
    if (path == NULL) {
4052
0
        return 0;
4053
0
    }
4054
4055
109k
    PyObject *output = NULL;
4056
109k
    if (PyUnicode_Check(path)) {
4057
109k
        output = path;
4058
109k
    }
4059
0
    else if (PyBytes_Check(path)) {
4060
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4061
0
                                                  PyBytes_GET_SIZE(path));
4062
0
        Py_DECREF(path);
4063
0
        if (!output) {
4064
0
            return 0;
4065
0
        }
4066
0
    }
4067
0
    else {
4068
0
        PyErr_Format(PyExc_TypeError,
4069
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4070
0
                     Py_TYPE(arg)->tp_name);
4071
0
        Py_DECREF(path);
4072
0
        return 0;
4073
0
    }
4074
4075
109k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4076
109k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4077
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4078
0
        Py_DECREF(output);
4079
0
        return 0;
4080
0
    }
4081
109k
    *(PyObject**)addr = output;
4082
109k
    return Py_CLEANUP_SUPPORTED;
4083
109k
}
4084
4085
4086
static int unicode_fill_utf8(PyObject *unicode);
4087
4088
4089
static int
4090
unicode_ensure_utf8(PyObject *unicode)
4091
65.9M
{
4092
65.9M
    int err = 0;
4093
65.9M
    if (PyUnicode_UTF8(unicode) == NULL) {
4094
157k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4095
157k
        if (PyUnicode_UTF8(unicode) == NULL) {
4096
157k
            err = unicode_fill_utf8(unicode);
4097
157k
        }
4098
157k
        Py_END_CRITICAL_SECTION();
4099
157k
    }
4100
65.9M
    return err;
4101
65.9M
}
4102
4103
const char *
4104
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4105
65.9M
{
4106
65.9M
    if (!PyUnicode_Check(unicode)) {
4107
0
        PyErr_BadArgument();
4108
0
        if (psize) {
4109
0
            *psize = -1;
4110
0
        }
4111
0
        return NULL;
4112
0
    }
4113
4114
65.9M
    if (unicode_ensure_utf8(unicode) == -1) {
4115
206
        if (psize) {
4116
206
            *psize = -1;
4117
206
        }
4118
206
        return NULL;
4119
206
    }
4120
4121
65.9M
    if (psize) {
4122
65.7M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4123
65.7M
    }
4124
65.9M
    return PyUnicode_UTF8(unicode);
4125
65.9M
}
4126
4127
const char *
4128
PyUnicode_AsUTF8(PyObject *unicode)
4129
250k
{
4130
250k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4131
250k
}
4132
4133
const char *
4134
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4135
2.88M
{
4136
2.88M
    Py_ssize_t size;
4137
2.88M
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4138
2.88M
    if (s && strlen(s) != (size_t)size) {
4139
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4140
0
        return NULL;
4141
0
    }
4142
2.88M
    return s;
4143
2.88M
}
4144
4145
/*
4146
PyUnicode_GetSize() has been deprecated since Python 3.3
4147
because it returned length of Py_UNICODE.
4148
4149
But this function is part of stable abi, because it doesn't
4150
include Py_UNICODE in signature and it was not excluded from
4151
stable ABI in PEP 384.
4152
*/
4153
PyAPI_FUNC(Py_ssize_t)
4154
PyUnicode_GetSize(PyObject *unicode)
4155
0
{
4156
0
    PyErr_SetString(PyExc_RuntimeError,
4157
0
                    "PyUnicode_GetSize has been removed.");
4158
0
    return -1;
4159
0
}
4160
4161
Py_ssize_t
4162
PyUnicode_GetLength(PyObject *unicode)
4163
19.8k
{
4164
19.8k
    if (!PyUnicode_Check(unicode)) {
4165
0
        PyErr_BadArgument();
4166
0
        return -1;
4167
0
    }
4168
19.8k
    return PyUnicode_GET_LENGTH(unicode);
4169
19.8k
}
4170
4171
Py_UCS4
4172
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4173
43
{
4174
43
    const void *data;
4175
43
    int kind;
4176
4177
43
    if (!PyUnicode_Check(unicode)) {
4178
0
        PyErr_BadArgument();
4179
0
        return (Py_UCS4)-1;
4180
0
    }
4181
43
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4182
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4183
0
        return (Py_UCS4)-1;
4184
0
    }
4185
43
    data = PyUnicode_DATA(unicode);
4186
43
    kind = PyUnicode_KIND(unicode);
4187
43
    return PyUnicode_READ(kind, data, index);
4188
43
}
4189
4190
int
4191
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4192
0
{
4193
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4194
0
        PyErr_BadArgument();
4195
0
        return -1;
4196
0
    }
4197
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4198
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4199
0
        return -1;
4200
0
    }
4201
0
    if (unicode_check_modifiable(unicode))
4202
0
        return -1;
4203
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4204
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4205
0
        return -1;
4206
0
    }
4207
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4208
0
                    index, ch);
4209
0
    return 0;
4210
0
}
4211
4212
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4217
4218
/* create or adjust a UnicodeDecodeError */
4219
static void
4220
make_decode_exception(PyObject **exceptionObject,
4221
                      const char *encoding,
4222
                      const char *input, Py_ssize_t length,
4223
                      Py_ssize_t startpos, Py_ssize_t endpos,
4224
                      const char *reason)
4225
3.35M
{
4226
3.35M
    if (*exceptionObject == NULL) {
4227
3.11M
        *exceptionObject = PyUnicodeDecodeError_Create(
4228
3.11M
            encoding, input, length, startpos, endpos, reason);
4229
3.11M
    }
4230
235k
    else {
4231
235k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4232
0
            goto onError;
4233
235k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4234
0
            goto onError;
4235
235k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4236
0
            goto onError;
4237
235k
    }
4238
3.35M
    return;
4239
4240
3.35M
onError:
4241
0
    Py_CLEAR(*exceptionObject);
4242
0
}
4243
4244
#ifdef MS_WINDOWS
4245
static int
4246
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4247
{
4248
    if (newsize > *size) {
4249
        wchar_t *newbuf = *buf;
4250
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4251
            PyErr_NoMemory();
4252
            return -1;
4253
        }
4254
        *buf = newbuf;
4255
    }
4256
    *size = newsize;
4257
    return 0;
4258
}
4259
4260
/* error handling callback helper:
4261
   build arguments, call the callback and check the arguments,
4262
   if no exception occurred, copy the replacement to the output
4263
   and adjust various state variables.
4264
   return 0 on success, -1 on error
4265
*/
4266
4267
static int
4268
unicode_decode_call_errorhandler_wchar(
4269
    const char *errors, PyObject **errorHandler,
4270
    const char *encoding, const char *reason,
4271
    const char **input, const char **inend, Py_ssize_t *startinpos,
4272
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4273
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4274
{
4275
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4276
4277
    PyObject *restuple = NULL;
4278
    PyObject *repunicode = NULL;
4279
    Py_ssize_t outsize;
4280
    Py_ssize_t insize;
4281
    Py_ssize_t requiredsize;
4282
    Py_ssize_t newpos;
4283
    PyObject *inputobj = NULL;
4284
    Py_ssize_t repwlen;
4285
4286
    if (*errorHandler == NULL) {
4287
        *errorHandler = PyCodec_LookupError(errors);
4288
        if (*errorHandler == NULL)
4289
            goto onError;
4290
    }
4291
4292
    make_decode_exception(exceptionObject,
4293
        encoding,
4294
        *input, *inend - *input,
4295
        *startinpos, *endinpos,
4296
        reason);
4297
    if (*exceptionObject == NULL)
4298
        goto onError;
4299
4300
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4301
    if (restuple == NULL)
4302
        goto onError;
4303
    if (!PyTuple_Check(restuple)) {
4304
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4305
        goto onError;
4306
    }
4307
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4308
        goto onError;
4309
4310
    /* Copy back the bytes variables, which might have been modified by the
4311
       callback */
4312
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4313
    if (!inputobj)
4314
        goto onError;
4315
    *input = PyBytes_AS_STRING(inputobj);
4316
    insize = PyBytes_GET_SIZE(inputobj);
4317
    *inend = *input + insize;
4318
    /* we can DECREF safely, as the exception has another reference,
4319
       so the object won't go away. */
4320
    Py_DECREF(inputobj);
4321
4322
    if (newpos<0)
4323
        newpos = insize+newpos;
4324
    if (newpos<0 || newpos>insize) {
4325
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4326
        goto onError;
4327
    }
4328
4329
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4330
    if (repwlen < 0)
4331
        goto onError;
4332
    repwlen--;
4333
    /* need more space? (at least enough for what we
4334
       have+the replacement+the rest of the string (starting
4335
       at the new input position), so we won't have to check space
4336
       when there are no errors in the rest of the string) */
4337
    requiredsize = *outpos;
4338
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4339
        goto overflow;
4340
    requiredsize += repwlen;
4341
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4342
        goto overflow;
4343
    requiredsize += insize - newpos;
4344
    outsize = *bufsize;
4345
    if (requiredsize > outsize) {
4346
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4347
            requiredsize = 2*outsize;
4348
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4349
            goto onError;
4350
        }
4351
    }
4352
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4353
    *outpos += repwlen;
4354
    *endinpos = newpos;
4355
    *inptr = *input + newpos;
4356
4357
    /* we made it! */
4358
    Py_DECREF(restuple);
4359
    return 0;
4360
4361
  overflow:
4362
    PyErr_SetString(PyExc_OverflowError,
4363
                    "decoded result is too long for a Python string");
4364
4365
  onError:
4366
    Py_XDECREF(restuple);
4367
    return -1;
4368
}
4369
#endif   /* MS_WINDOWS */
4370
4371
static int
4372
unicode_decode_call_errorhandler_writer(
4373
    const char *errors, PyObject **errorHandler,
4374
    const char *encoding, const char *reason,
4375
    const char **input, const char **inend, Py_ssize_t *startinpos,
4376
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4377
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4378
3.35M
{
4379
3.35M
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4380
4381
3.35M
    PyObject *restuple = NULL;
4382
3.35M
    PyObject *repunicode = NULL;
4383
3.35M
    Py_ssize_t insize;
4384
3.35M
    Py_ssize_t newpos;
4385
3.35M
    Py_ssize_t replen;
4386
3.35M
    Py_ssize_t remain;
4387
3.35M
    PyObject *inputobj = NULL;
4388
3.35M
    int need_to_grow = 0;
4389
3.35M
    const char *new_inptr;
4390
4391
3.35M
    if (*errorHandler == NULL) {
4392
3.11M
        *errorHandler = PyCodec_LookupError(errors);
4393
3.11M
        if (*errorHandler == NULL)
4394
0
            goto onError;
4395
3.11M
    }
4396
4397
3.35M
    make_decode_exception(exceptionObject,
4398
3.35M
        encoding,
4399
3.35M
        *input, *inend - *input,
4400
3.35M
        *startinpos, *endinpos,
4401
3.35M
        reason);
4402
3.35M
    if (*exceptionObject == NULL)
4403
0
        goto onError;
4404
4405
3.35M
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4406
3.35M
    if (restuple == NULL)
4407
3.07M
        goto onError;
4408
276k
    if (!PyTuple_Check(restuple)) {
4409
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4410
0
        goto onError;
4411
0
    }
4412
276k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4413
0
        goto onError;
4414
4415
    /* Copy back the bytes variables, which might have been modified by the
4416
       callback */
4417
276k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4418
276k
    if (!inputobj)
4419
0
        goto onError;
4420
276k
    remain = *inend - *input - *endinpos;
4421
276k
    *input = PyBytes_AS_STRING(inputobj);
4422
276k
    insize = PyBytes_GET_SIZE(inputobj);
4423
276k
    *inend = *input + insize;
4424
    /* we can DECREF safely, as the exception has another reference,
4425
       so the object won't go away. */
4426
276k
    Py_DECREF(inputobj);
4427
4428
276k
    if (newpos<0)
4429
0
        newpos = insize+newpos;
4430
276k
    if (newpos<0 || newpos>insize) {
4431
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4432
0
        goto onError;
4433
0
    }
4434
4435
276k
    replen = PyUnicode_GET_LENGTH(repunicode);
4436
276k
    if (replen > 1) {
4437
33.7k
        writer->min_length += replen - 1;
4438
33.7k
        need_to_grow = 1;
4439
33.7k
    }
4440
276k
    new_inptr = *input + newpos;
4441
276k
    if (*inend - new_inptr > remain) {
4442
        /* We don't know the decoding algorithm here so we make the worst
4443
           assumption that one byte decodes to one unicode character.
4444
           If unfortunately one byte could decode to more unicode characters,
4445
           the decoder may write out-of-bound then.  Is it possible for the
4446
           algorithms using this function? */
4447
18.2k
        writer->min_length += *inend - new_inptr - remain;
4448
18.2k
        need_to_grow = 1;
4449
18.2k
    }
4450
276k
    if (need_to_grow) {
4451
33.8k
        writer->overallocate = 1;
4452
33.8k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4453
33.8k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4454
0
            goto onError;
4455
33.8k
    }
4456
276k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4457
0
        goto onError;
4458
4459
276k
    *endinpos = newpos;
4460
276k
    *inptr = new_inptr;
4461
4462
    /* we made it! */
4463
276k
    Py_DECREF(restuple);
4464
276k
    return 0;
4465
4466
3.07M
  onError:
4467
3.07M
    Py_XDECREF(restuple);
4468
3.07M
    return -1;
4469
276k
}
4470
4471
/* --- UTF-7 Codec -------------------------------------------------------- */
4472
4473
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4474
4475
/* Three simple macros defining base-64. */
4476
4477
/* Nonzero when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  The argument is evaluated several times. */

#define IS_BASE64(c)                      \
    ((c) == '+' || (c) == '/' ||          \
     ((c) >= '0' && (c) <= '9') ||        \
     ((c) >= 'a' && (c) <= 'z') ||        \
     ((c) >= 'A' && (c) <= 'Z'))
4484
4485
/* given that c is a base-64 character, what is its base-64 value? */
4486
4487
#define FROM_BASE64(c)                                                  \
4488
273k
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
4489
273k
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
4490
223k
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
4491
133k
     (c) == '+' ? 62 : 63)
4492
4493
/* What is the base-64 character of the bottom 6 bits of n? */
4494
4495
#define TO_BASE64(n)  \
4496
0
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4497
4498
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Values above 127 never decode directly.  The argument is
 * expanded twice, so it must have no side effects. */
4502
4503
#define DECODE_DIRECT(c)                                \
4504
7.14M
    ((c) <= 127 && (c) != '+')
4505
4506
/* The UTF-7 encoder treats ASCII characters differently according to
4507
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4508
 * the above).  See RFC2152.  This array identifies these different
4509
 * sets:
4510
 * 0 : "Set D"
4511
 *     alphanumeric and '(),-./:?
4512
 * 1 : "Set O"
4513
 *     !"#$%&*;<=>@[]^_`{|}
4514
 * 2 : "whitespace"
4515
 *     ht nl cr sp
4516
 * 3 : special (must be base64 encoded)
4517
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4518
 */
4519
4520
/* UTF-7 category of each ASCII code 0-127:
 *   0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be base64
 *   encoded).
 * The table is only ever read, so it is const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4539
4540
/* ENCODE_DIRECT: this character should be encoded as itself.  RFC 2152
 * leaves it to the application whether Set O and whitespace are encoded
 * directly.  As written, this macro answers yes for both: it is true for
 * every c in 1..127 whose utf7_category entry is not 3 (special).  NUL
 * and everything >= 128 are never encoded directly.  The argument is
 * expanded several times, so it must have no side effects. */
4545
4546
#define ENCODE_DIRECT(c) \
4547
0
    ((c) < 128 && (c) > 0 && ((utf7_category[(c)] != 3)))
4548
4549
PyObject *
4550
PyUnicode_DecodeUTF7(const char *s,
4551
                     Py_ssize_t size,
4552
                     const char *errors)
4553
0
{
4554
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4555
0
}
4556
4557
/* The decoder.  The only state we preserve is our read position,
4558
 * i.e. how many characters we have consumed.  So if we end in the
4559
 * middle of a shift sequence we have to back off the read position
4560
 * and the output to the beginning of the sequence, otherwise we lose
4561
 * all the shift state (seen bits, number of bits seen, high
4562
 * surrogate). */
4563
4564
/* Decode `size` bytes of UTF-7 starting at `s` into a new string object.
 *
 * errors:   error handler name, passed through to
 *           unicode_decode_call_errorhandler_writer().
 * consumed: if non-NULL, receives the number of input bytes consumed.  A
 *           shift sequence still open at the end of the input is then not
 *           an error: *consumed is set to the position of its '+' and the
 *           output produced for it is dropped.  If NULL, an open shift
 *           sequence left in an inconsistent state is reported through
 *           the error handler as "unterminated shift sequence".
 *
 * Returns the decoded object, or NULL on error. */
PyObject *
4565
PyUnicode_DecodeUTF7Stateful(const char *s,
4566
                             Py_ssize_t size,
4567
                             const char *errors,
4568
                             Py_ssize_t *consumed)
4569
24.3k
{
4570
24.3k
    const char *starts = s;
4571
24.3k
    Py_ssize_t startinpos;
4572
24.3k
    Py_ssize_t endinpos;
4573
24.3k
    const char *e;
4574
24.3k
    _PyUnicodeWriter writer;
4575
24.3k
    const char *errmsg = "";
4576
24.3k
    int inShift = 0;
    /* writer position at which the current shift sequence started */
4577
24.3k
    Py_ssize_t shiftOutStart;
    /* number of not-yet-emitted bits held in base64buffer */
4578
24.3k
    unsigned int base64bits = 0;
4579
24.3k
    unsigned long base64buffer = 0;
    /* pending high surrogate waiting for its low half, or 0 */
4580
24.3k
    Py_UCS4 surrogate = 0;
4581
24.3k
    PyObject *errorHandler = NULL;
4582
24.3k
    PyObject *exc = NULL;
4583
4584
24.3k
    if (size == 0) {
4585
0
        if (consumed)
4586
0
            *consumed = 0;
4587
0
        _Py_RETURN_UNICODE_EMPTY();
4588
0
    }
4589
4590
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4591
24.3k
    _PyUnicodeWriter_Init(&writer);
4592
24.3k
    writer.min_length = size;
4593
4594
24.3k
    shiftOutStart = 0;
4595
24.3k
    e = s + size;
4596
4597
7.46M
    while (s < e) {
4598
7.45M
        Py_UCS4 ch;
        /* Also entered by goto from the end-of-input handling below when
           the error handler left s before e. */
4599
7.45M
      restart:
4600
7.45M
        ch = (unsigned char) *s;
4601
4602
7.45M
        if (inShift) { /* in a base-64 section */
4603
288k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4604
273k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4605
273k
                base64bits += 6;
4606
273k
                s++;
4607
273k
                if (base64bits >= 16) {
4608
                    /* we have enough bits for a UTF-16 value */
4609
97.4k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4610
97.4k
                    base64bits -= 16;
4611
97.4k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4612
97.4k
                    assert(outCh <= 0xffff);
4613
97.4k
                    if (surrogate) {
4614
                        /* expecting a second surrogate */
4615
8.27k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4616
2.80k
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4617
2.80k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4618
0
                                goto onError;
4619
2.80k
                            surrogate = 0;
4620
2.80k
                            continue;
4621
2.80k
                        }
4622
5.46k
                        else {
                            /* unpaired high surrogate: write it as is and
                               go on to handle outCh normally */
4623
5.46k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4624
0
                                goto onError;
4625
5.46k
                            surrogate = 0;
4626
5.46k
                        }
4627
8.27k
                    }
4628
94.6k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4629
                        /* first surrogate */
4630
11.4k
                        surrogate = outCh;
4631
11.4k
                    }
4632
83.2k
                    else {
4633
83.2k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4634
0
                            goto onError;
4635
83.2k
                    }
4636
94.6k
                }
4637
273k
            }
4638
15.1k
            else { /* now leaving a base-64 section */
4639
15.1k
                inShift = 0;
4640
15.1k
                if (base64bits > 0) { /* left-over bits */
4641
11.8k
                    if (base64bits >= 6) {
4642
                        /* We've seen at least one base-64 character */
4643
5.44k
                        s++;
4644
5.44k
                        errmsg = "partial character in shift sequence";
4645
5.44k
                        goto utf7Error;
4646
5.44k
                    }
4647
6.43k
                    else {
4648
                        /* Some bits remain; they should be zero */
4649
6.43k
                        if (base64buffer != 0) {
4650
1.50k
                            s++;
4651
1.50k
                            errmsg = "non-zero padding bits in shift sequence";
4652
1.50k
                            goto utf7Error;
4653
1.50k
                        }
4654
6.43k
                    }
4655
11.8k
                }
                /* A pending high surrogate is written out only when the
                   terminating byte decodes directly; otherwise it is
                   discarded. */
4656
8.16k
                if (surrogate && DECODE_DIRECT(ch)) {
4657
2.31k
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4658
0
                        goto onError;
4659
2.31k
                }
4660
8.16k
                surrogate = 0;
4661
8.16k
                if (ch == '-') {
4662
                    /* '-' is absorbed; other terminating
4663
                       characters are preserved */
4664
2.00k
                    s++;
4665
2.00k
                }
4666
8.16k
            }
4667
288k
        }
4668
7.16M
        else if ( ch == '+' ) {
            /* remember where the shift sequence starts, for error
               reporting and for backing off in stateful mode */
4669
23.7k
            startinpos = s-starts;
4670
23.7k
            s++; /* consume '+' */
4671
23.7k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4672
2.71k
                s++;
4673
2.71k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4674
0
                    goto onError;
4675
2.71k
            }
4676
21.0k
            else if (s < e && !IS_BASE64(*s)) {
4677
2.53k
                s++;
4678
2.53k
                errmsg = "ill-formed sequence";
4679
2.53k
                goto utf7Error;
4680
2.53k
            }
4681
18.4k
            else { /* begin base64-encoded section */
4682
18.4k
                inShift = 1;
4683
18.4k
                surrogate = 0;
4684
18.4k
                shiftOutStart = writer.pos;
4685
18.4k
                base64bits = 0;
4686
18.4k
                base64buffer = 0;
4687
18.4k
            }
4688
23.7k
        }
4689
7.14M
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4690
7.02M
            s++;
4691
7.02M
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4692
0
                goto onError;
4693
7.02M
        }
4694
113k
        else {
4695
113k
            startinpos = s-starts;
4696
113k
            s++;
4697
113k
            errmsg = "unexpected special character";
4698
113k
            goto utf7Error;
4699
113k
        }
4700
7.32M
        continue;
        /* Common error exit of the loop body: report input
           [startinpos, s) with errmsg.  The handler call receives the
           addresses of starts, e and s. */
4701
7.32M
utf7Error:
4702
123k
        endinpos = s-starts;
4703
123k
        if (unicode_decode_call_errorhandler_writer(
4704
123k
                errors, &errorHandler,
4705
123k
                "utf7", errmsg,
4706
123k
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4707
123k
                &writer))
4708
10.7k
            goto onError;
4709
123k
    }
4710
4711
    /* end of string */
4712
4713
13.5k
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4714
        /* if we're in an inconsistent state, that's an error */
4715
3.35k
        inShift = 0;
4716
3.35k
        if (surrogate ||
4717
2.99k
                (base64bits >= 6) ||
4718
1.95k
                (base64bits > 0 && base64buffer != 0)) {
4719
1.95k
            endinpos = size;
4720
1.95k
            if (unicode_decode_call_errorhandler_writer(
4721
1.95k
                    errors, &errorHandler,
4722
1.95k
                    "utf7", "unterminated shift sequence",
4723
1.95k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4724
1.95k
                    &writer))
4725
1.60k
                goto onError;
            /* the handler may have repositioned s; resume decoding if
               input remains */
4726
346
            if (s < e)
4727
0
                goto restart;
4728
346
        }
4729
3.35k
    }
4730
4731
    /* return state */
4732
11.9k
    if (consumed) {
4733
0
        if (inShift) {
4734
0
            *consumed = startinpos;
            /* If the open shift sequence already produced output and the
               writer holds non-ASCII data, build the result directly from
               the first shiftOutStart characters instead of rewinding the
               writer. */
4735
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4736
0
                PyObject *result = PyUnicode_FromKindAndData(
4737
0
                        writer.kind, writer.data, shiftOutStart);
4738
0
                Py_XDECREF(errorHandler);
4739
0
                Py_XDECREF(exc);
4740
0
                _PyUnicodeWriter_Dealloc(&writer);
4741
0
                return result;
4742
0
            }
4743
0
            writer.pos = shiftOutStart; /* back off output */
4744
0
        }
4745
0
        else {
4746
0
            *consumed = s-starts;
4747
0
        }
4748
0
    }
4749
4750
11.9k
    Py_XDECREF(errorHandler);
4751
11.9k
    Py_XDECREF(exc);
4752
11.9k
    return _PyUnicodeWriter_Finish(&writer);
4753
4754
12.3k
  onError:
4755
12.3k
    Py_XDECREF(errorHandler);
4756
12.3k
    Py_XDECREF(exc);
4757
12.3k
    _PyUnicodeWriter_Dealloc(&writer);
4758
12.3k
    return NULL;
4759
11.9k
}
4760
4761
4762
/* Encode the string object `str` as UTF-7.
 *
 * `errors` is accepted but not referenced by this function.
 *
 * An empty string yields the empty-bytes constant.  Otherwise the output
 * buffer is created with len * 8 bytes up front and the function writes
 * into it without further bounds checks.  Returns NULL if len * 8 would
 * exceed PY_SSIZE_T_MAX or the bytes writer cannot be created. */
PyObject *
4763
_PyUnicode_EncodeUTF7(PyObject *str,
4764
                      const char *errors)
4765
0
{
4766
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
4767
0
    if (len == 0) {
4768
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
4769
0
    }
4770
0
    int kind = PyUnicode_KIND(str);
4771
0
    const void *data = PyUnicode_DATA(str);
4772
4773
    /* It might be possible to tighten this worst case */
4774
0
    if (len > PY_SSIZE_T_MAX / 8) {
4775
0
        return PyErr_NoMemory();
4776
0
    }
4777
0
    PyBytesWriter *writer = PyBytesWriter_Create(len * 8);
4778
0
    if (writer == NULL) {
4779
0
        return NULL;
4780
0
    }
4781
4782
    /* inShift: currently inside a '+...' base64 section.
       base64bits: number of bits in base64buffer not yet written out. */
0
    int inShift = 0;
4783
0
    unsigned int base64bits = 0;
4784
0
    unsigned long base64buffer = 0;
4785
0
    char *out = PyBytesWriter_GetData(writer);
4786
0
    for (Py_ssize_t i = 0; i < len; ++i) {
4787
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4788
4789
0
        if (inShift) {
4790
0
            if (ENCODE_DIRECT(ch)) {
4791
                /* shifting out */
4792
0
                if (base64bits) { /* output remaining bits */
                    /* left-align the leftover bits in a final sextet;
                       the low bits are zero padding */
4793
0
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4794
0
                    base64buffer = 0;
4795
0
                    base64bits = 0;
4796
0
                }
4797
0
                inShift = 0;
4798
                /* Characters not in the BASE64 set implicitly unshift the sequence
4799
                   so no '-' is required, except if the character is itself a '-' */
4800
0
                if (IS_BASE64(ch) || ch == '-') {
4801
0
                    *out++ = '-';
4802
0
                }
4803
0
                *out++ = (char) ch;
4804
0
            }
4805
0
            else {
4806
0
                goto encode_char;
4807
0
            }
4808
0
        }
4809
0
        else { /* not in a shift sequence */
4810
0
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
4811
0
                *out++ = '+';
4812
0
                        *out++ = '-';
4813
0
            }
4814
0
            else if (ENCODE_DIRECT(ch)) {
4815
0
                *out++ = (char) ch;
4816
0
            }
4817
0
            else {
                /* open a base64 section and encode ch inside it */
4818
0
                *out++ = '+';
4819
0
                inShift = 1;
4820
0
                goto encode_char;
4821
0
            }
4822
0
        }
4823
0
        continue;
        /* Append ch to the bit buffer as one 16-bit unit (two for code
           points >= 0x10000) and write out every complete sextet. */
4824
0
encode_char:
4825
0
        if (ch >= 0x10000) {
4826
0
            assert(ch <= MAX_UNICODE);
4827
4828
            /* code first surrogate */
4829
0
            base64bits += 16;
4830
0
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4831
0
            while (base64bits >= 6) {
4832
0
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4833
0
                base64bits -= 6;
4834
0
            }
4835
            /* prepare second surrogate */
4836
0
            ch = Py_UNICODE_LOW_SURROGATE(ch);
4837
0
        }
4838
0
        base64bits += 16;
4839
0
        base64buffer = (base64buffer << 16) | ch;
4840
0
        while (base64bits >= 6) {
4841
0
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4842
0
            base64bits -= 6;
4843
0
        }
4844
0
    }
    /* end of input: flush leftover bits, then close an open shift
       sequence with '-' */
4845
0
    if (base64bits)
4846
0
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4847
0
    if (inShift)
4848
0
        *out++ = '-';
4849
0
    return PyBytesWriter_FinishWithPointer(writer, out);
4850
0
}
4851
4852
#undef IS_BASE64
4853
#undef FROM_BASE64
4854
#undef TO_BASE64
4855
#undef DECODE_DIRECT
4856
#undef ENCODE_DIRECT
4857
4858
/* --- UTF-8 Codec -------------------------------------------------------- */
4859
4860
PyObject *
4861
PyUnicode_DecodeUTF8(const char *s,
4862
                     Py_ssize_t size,
4863
                     const char *errors)
4864
76.3M
{
4865
76.3M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4866
76.3M
}
4867
4868
#include "stringlib/asciilib.h"
4869
#include "stringlib/codecs.h"
4870
#include "stringlib/undef.h"
4871
4872
#include "stringlib/ucs1lib.h"
4873
#include "stringlib/codecs.h"
4874
#include "stringlib/undef.h"
4875
4876
#include "stringlib/ucs2lib.h"
4877
#include "stringlib/codecs.h"
4878
#include "stringlib/undef.h"
4879
4880
#include "stringlib/ucs4lib.h"
4881
#include "stringlib/codecs.h"
4882
#include "stringlib/undef.h"
4883
4884
#if (SIZEOF_SIZE_T == 8)
4885
/* Mask to quickly check whether a C 'size_t' contains a
   non-ASCII, UTF8-encoded char: it selects the top bit of every byte. */
4887
191M
# define ASCII_CHAR_MASK 0x8080808080808080ULL
4888
// used to count codepoints in UTF-8 string.
// VECTOR_0101 selects the lowest bit of every byte;
// VECTOR_00FF selects every other byte.
4889
52.5M
# define VECTOR_0101     0x0101010101010101ULL
4890
936k
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
4891
#elif (SIZEOF_SIZE_T == 4)
4892
// Same masks for a 4-byte size_t.
# define ASCII_CHAR_MASK 0x80808080U
4893
# define VECTOR_0101     0x01010101U
4894
# define VECTOR_00FF     0x00ff00ffU
4895
#else
4896
# error C 'size_t' size should be either 4 or 8!
4897
#endif
4898
4899
// ctz(v): index of the lowest set bit of v (count of trailing zero bits).
// HAVE_CTZ is 1 when an implementation is available for this compiler and
// 0 otherwise; ctz() is only defined when HAVE_CTZ is 1.
// NOTE(review): the GCC/Clang builtin is documented as undefined for
// v == 0, so callers presumably only pass non-zero values -- confirm.
#if (defined(__clang__) || defined(__GNUC__))
4900
#define HAVE_CTZ 1
4901
static inline unsigned int
4902
ctz(size_t v)
4903
12.8M
{
4904
12.8M
    return __builtin_ctzll((unsigned long long)v);
4905
12.8M
}
4906
#elif defined(_MSC_VER)
4907
#define HAVE_CTZ 1
4908
static inline unsigned int
4909
ctz(size_t v)
4910
{
4911
    unsigned long pos;
4912
    // pick the bit-scan intrinsic that matches the width of size_t
#if SIZEOF_SIZE_T == 4
4913
    _BitScanForward(&pos, v);
4914
#else
4915
    _BitScanForward64(&pos, v);
4916
#endif /* SIZEOF_SIZE_T */
4917
    return pos;
4918
}
4919
#else
4920
#define HAVE_CTZ 0
4921
#endif
4922
4923
#if HAVE_CTZ && PY_LITTLE_ENDIAN
4924
// Load p[0]..p[size-1] into a size_t without unaligned access or reading
// past p[size-1].  Bytes are copied one at a time into a zeroed union, so
// the bytes of the result beyond `size` are 0.  A size of 0 reads nothing
// and returns 0; any size not listed in the switch is handled by
// `default`, which copies SIZEOF_SIZE_T bytes.
4925
static size_t
4926
load_unaligned(const unsigned char *p, size_t size)
4927
64.6M
{
4928
64.6M
    union {
4929
64.6M
        size_t s;
4930
64.6M
        unsigned char b[SIZEOF_SIZE_T];
4931
64.6M
    } u;
4932
64.6M
    u.s = 0;
4933
    // This switch statement assumes little endian because:
4934
    // * union is faster than bitwise or and shift.
4935
    // * big endian machine is rare and hard to maintain.
4936
64.6M
    switch (size) {
    // `default` falls into the first case that follows it: case 8 on
    // 8-byte builds, case 4 otherwise.
4937
0
    default:
4938
0
#if SIZEOF_SIZE_T == 8
4939
0
    case 8:
4940
0
        u.b[7] = p[7];
4941
0
        _Py_FALLTHROUGH;
4942
3.41M
    case 7:
4943
3.41M
        u.b[6] = p[6];
4944
3.41M
        _Py_FALLTHROUGH;
4945
11.0M
    case 6:
4946
11.0M
        u.b[5] = p[5];
4947
11.0M
        _Py_FALLTHROUGH;
4948
20.6M
    case 5:
4949
20.6M
        u.b[4] = p[4];
4950
20.6M
        _Py_FALLTHROUGH;
4951
20.6M
#endif
4952
26.4M
    case 4:
4953
26.4M
        u.b[3] = p[3];
4954
26.4M
        _Py_FALLTHROUGH;
4955
42.9M
    case 3:
4956
42.9M
        u.b[2] = p[2];
4957
42.9M
        _Py_FALLTHROUGH;
4958
55.9M
    case 2:
4959
55.9M
        u.b[1] = p[1];
4960
55.9M
        _Py_FALLTHROUGH;
4961
57.4M
    case 1:
4962
57.4M
        u.b[0] = p[0];
4963
57.4M
        break;
4964
7.26M
    case 0:
4965
7.26M
        break;
4966
64.6M
    }
4967
64.6M
    return u.s;
4968
64.6M
}
4969
#endif
4970
4971
/*
4972
 * Find the first non-ASCII character in a byte sequence.
4973
 *
4974
 * This function scans a range of bytes from `start` to `end` and returns the
4975
 * index of the first byte that is not an ASCII character (i.e., has the most
4976
 * significant bit set). If all characters in the range are ASCII, it returns
4977
 * `end - start`.
4978
 */
4979
// Returns the offset from `start` of the first byte in [start, end) whose
// top bit is set, or `end - start` if there is none.
static Py_ssize_t
4980
find_first_nonascii(const unsigned char *start, const unsigned char *end)
4981
70.9M
{
4982
    // The search is done in `size_t` chunks.
4983
    // The start and end might not be aligned at `size_t` boundaries,
4984
    // so they're handled specially.
4985
4986
70.9M
    const unsigned char *p = start;
4987
4988
70.9M
    if (end - start >= SIZEOF_SIZE_T) {
4989
        // Avoid unaligned read.
4990
24.0M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
        // Check the first SIZEOF_SIZE_T bytes through memcpy, then move p
        // to the next aligned address.  The aligned loop below may look at
        // some of these bytes again.
4991
24.0M
        size_t u;
4992
24.0M
        memcpy(&u, p, sizeof(size_t));
4993
24.0M
        u &= ASCII_CHAR_MASK;
4994
24.0M
        if (u) {
            // After masking only bit 7 of each byte can be set, so the
            // lowest set bit is at position 8*k + 7, where k is the index
            // of the first non-ASCII byte.
4995
4.93M
            return (ctz(u) - 7) / 8;
4996
4.93M
        }
4997
19.0M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
4998
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
4999
        // Without ctz / little endian: walk byte by byte up to the next
        // aligned address.
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
5000
        while (p < p2) {
5001
            if (*p & 0x80) {
5002
                return p - start;
5003
            }
5004
            p++;
5005
        }
5006
#endif
5007
5008
        // e is the last address at which a full size_t can still be read
        // without passing `end`.
19.0M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5009
112M
        while (p <= e) {
5010
94.3M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5011
94.3M
            if (u) {
5012
1.35M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5013
1.35M
                return p - start + (ctz(u) - 7) / 8;
5014
#else
5015
                // big endian and minor compilers are difficult to test.
5016
                // fallback to per byte check.
5017
                break;
5018
#endif
5019
1.35M
            }
5020
93.0M
            p += SIZEOF_SIZE_T;
5021
93.0M
        }
5022
19.0M
    }
    // Tail: fewer than SIZEOF_SIZE_T bytes remain on the ctz path; the
    // fallback path scans whatever is left byte by byte.
5023
64.6M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5024
70.9M
    assert((end - p) < SIZEOF_SIZE_T);
5025
    // we can not use *(const size_t*)p to avoid buffer overrun.
5026
64.6M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5027
64.6M
    if (u) {
5028
6.51M
        return p - start + (ctz(u) - 7) / 8;
5029
6.51M
    }
5030
58.1M
    return end - start;
5031
#else
5032
    while (p < end) {
5033
        if (*p & 0x80) {
5034
            break;
5035
        }
5036
        p++;
5037
    }
5038
    return p - start;
5039
#endif
5040
64.6M
}
5041
5042
// Returns 1 if `ch` has the bit pattern of the first byte of a UTF-8
// sequence (0xxxxxxx or x1xxxxxx, i.e. not a 10xxxxxx continuation byte),
// and 0 otherwise.
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    const int top_bit_clear = (ch & 0x80u) == 0;  // 0xxxxxxx
    const int bit6_set = (ch & 0x40u) != 0;       // x1xxxxxx
    return top_bit_clear | bit6_set;
}
5048
5049
static inline size_t
5050
vector_utf8_start_chars(size_t v)
5051
52.5M
{
5052
52.5M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5053
52.5M
}
5054
5055
5056
// Count the number of UTF-8 code points in a given byte sequence.
5057
static Py_ssize_t
5058
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5059
347k
{
5060
347k
    Py_ssize_t len = 0;
5061
5062
347k
    if (end - s >= SIZEOF_SIZE_T) {
5063
289k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5064
17.7k
            len += scalar_utf8_start_char(*s++);
5065
17.7k
        }
5066
5067
740k
        while (s + SIZEOF_SIZE_T <= end) {
5068
468k
            const unsigned char *e = end;
5069
468k
            if (e - s > SIZEOF_SIZE_T * 255) {
5070
198k
                e = s + SIZEOF_SIZE_T * 255;
5071
198k
            }
5072
468k
            Py_ssize_t vstart = 0;
5073
52.9M
            while (s + SIZEOF_SIZE_T <= e) {
5074
52.5M
                size_t v = *(size_t*)s;
5075
52.5M
                size_t vs = vector_utf8_start_chars(v);
5076
52.5M
                vstart += vs;
5077
52.5M
                s += SIZEOF_SIZE_T;
5078
52.5M
            }
5079
468k
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5080
468k
            vstart += vstart >> 16;
5081
468k
#if SIZEOF_SIZE_T == 8
5082
468k
            vstart += vstart >> 32;
5083
468k
#endif
5084
468k
            len += vstart & 0x7ff;
5085
468k
        }
5086
271k
    }
5087
1.20M
    while (s < end) {
5088
854k
        len += scalar_utf8_start_char(*s++);
5089
854k
    }
5090
347k
    return len;
5091
347k
}
5092
5093
/* Copy the leading run of ASCII bytes (top bit clear) from [start, end)
 * to `dest` and return how many bytes were copied.  Copying stops at the
 * first byte with its top bit set, or at `end`. */
static Py_ssize_t
5094
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5095
16.6M
{
5096
16.6M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5097
16.6M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5098
16.4M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5099
12.8M
    {
5100
        /* Fast path, see in STRINGLIB(utf8_decode) for
5101
           an explanation. */
5102
12.8M
        const char *p = start;
5103
12.8M
        Py_UCS1 *q = dest;
        /* copy one size_t at a time while the whole word is ASCII */
5104
18.3M
        while (p + SIZEOF_SIZE_T <= end) {
5105
8.69M
            size_t value = *(const size_t *) p;
5106
8.69M
            if (value & ASCII_CHAR_MASK)
5107
3.15M
                break;
5108
5.53M
            *((size_t *)q) = value;
5109
5.53M
            p += SIZEOF_SIZE_T;
5110
5.53M
            q += SIZEOF_SIZE_T;
5111
5.53M
        }
        /* then byte by byte up to the first non-ASCII byte or the end */
5112
56.5M
        while (p < end) {
5113
46.9M
            if ((unsigned char)*p & 0x80)
5114
3.17M
                break;
5115
43.7M
            *q++ = *p++;
5116
43.7M
        }
5117
12.8M
        return p - start;
5118
12.8M
    }
5119
3.79M
#endif
    /* General path: locate the first non-ASCII byte, then copy everything
       before it in one call. */
5120
3.79M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5121
3.79M
                                         (const unsigned char*)end);
5122
3.79M
    memcpy(dest, start, pos);
5123
3.79M
    return pos;
5124
16.6M
}
5125
5126
/* Decode the UTF-8 bytes in [s, end) into `writer`.
 *
 * starts:        beginning of the input buffer; error positions and
 *                *consumed are computed relative to it.
 * error_handler: handler kind; _Py_ERROR_UNKNOWN is resolved from `errors`
 *                on the first decoding error.
 * errors:        error handler name.
 * consumed:      if non-NULL, an incomplete sequence at the end of the
 *                input stops decoding instead of being an error, and
 *                *consumed receives the number of bytes consumed.
 *
 * Returns 0 on success and -1 on error. */
static int
5127
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5128
                         const char *starts, const char *s, const char *end,
5129
                         _Py_error_handler error_handler,
5130
                         const char *errors,
5131
                         Py_ssize_t *consumed)
5132
12.8M
{
5133
12.8M
    Py_ssize_t startinpos, endinpos;
5134
12.8M
    const char *errmsg = "";
5135
12.8M
    PyObject *error_handler_obj = NULL;
5136
12.8M
    PyObject *exc = NULL;
5137
5138
343M
    while (s < end) {
5139
339M
        Py_UCS4 ch;
5140
339M
        int kind = writer->kind;
5141
5142
        /* Run the decoder specialised for the writer's current kind. */
339M
        if (kind == PyUnicode_1BYTE_KIND) {
5143
12.9M
            if (PyUnicode_IS_ASCII(writer->buffer))
5144
12.4M
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5145
498k
            else
5146
498k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5147
326M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5148
112M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5149
213M
        } else {
5150
213M
            assert(kind == PyUnicode_4BYTE_KIND);
5151
213M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5152
213M
        }
5153
5154
        /* The decoder's return value is used as a status code:
           0 = end of data (possibly mid-sequence), 1 = invalid start
           byte, 2..4 = invalid continuation byte, anything else = a
           character that does not fit the current kind. */
339M
        switch (ch) {
5155
8.41M
        case 0:
5156
8.41M
            if (s == end || consumed)
5157
8.38M
                goto End;
5158
25.7k
            errmsg = "unexpected end of data";
5159
25.7k
            startinpos = s - starts;
5160
25.7k
            endinpos = end - starts;
5161
25.7k
            break;
5162
242M
        case 1:
5163
242M
            errmsg = "invalid start byte";
5164
242M
            startinpos = s - starts;
5165
242M
            endinpos = startinpos + 1;
5166
242M
            break;
5167
73.6M
        case 2:
5168
73.6M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5169
0
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5170
0
            {
5171
                /* Truncated surrogate code in range D800-DFFF */
5172
0
                goto End;
5173
0
            }
5174
73.6M
            _Py_FALLTHROUGH;
5175
75.0M
        case 3:
5176
75.2M
        case 4:
5177
75.2M
            errmsg = "invalid continuation byte";
5178
75.2M
            startinpos = s - starts;
            /* the reported error range covers ch - 1 bytes */
5179
75.2M
            endinpos = startinpos + ch - 1;
5180
75.2M
            break;
5181
12.5M
        default:
5182
            // ch doesn't fit into kind, so change the buffer kind to write
5183
            // the character
5184
12.5M
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5185
0
                goto onError;
5186
12.5M
            continue;
5187
339M
        }
5188
5189
        /* Only decoding errors reach this point. */
318M
        if (error_handler == _Py_ERROR_UNKNOWN)
5190
180k
            error_handler = _Py_GetErrorHandler(errors);
5191
5192
318M
        switch (error_handler) {
5193
0
        case _Py_ERROR_IGNORE:
            /* skip the invalid bytes without writing anything */
5194
0
            s += (endinpos - startinpos);
5195
0
            break;
5196
5197
313M
        case _Py_ERROR_REPLACE:
            /* write U+FFFD once for the whole invalid range */
5198
313M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5199
0
                goto onError;
5200
313M
            s += (endinpos - startinpos);
5201
313M
            break;
5202
5203
4.15M
        case _Py_ERROR_SURROGATEESCAPE:
5204
4.15M
        {
5205
4.15M
            Py_ssize_t i;
5206
5207
4.15M
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5208
0
                goto onError;
            /* write each invalid byte b as the code point 0xdc00 + b */
5209
8.30M
            for (i=startinpos; i<endinpos; i++) {
5210
4.15M
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5211
4.15M
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5212
4.15M
                                ch + 0xdc00);
5213
4.15M
                writer->pos++;
5214
4.15M
            }
5215
4.15M
            s += (endinpos - startinpos);
5216
4.15M
            break;
5217
4.15M
        }
5218
5219
1.61k
        default:
            /* any other handler goes through the generic callback, which
               receives the addresses of starts, end and s */
5220
1.61k
            if (unicode_decode_call_errorhandler_writer(
5221
1.61k
                    errors, &error_handler_obj,
5222
1.61k
                    "utf-8", errmsg,
5223
1.61k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5224
1.61k
                    writer)) {
5225
1.60k
                goto onError;
5226
1.60k
            }
5227
5228
            /* make room in the writer for the remaining input */
8
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5229
0
                goto onError;
5230
0
            }
5231
318M
        }
5232
318M
    }
5233
5234
12.8M
End:
5235
12.8M
    if (consumed)
5236
1.19k
        *consumed = s - starts;
5237
5238
12.8M
    Py_XDECREF(error_handler_obj);
5239
12.8M
    Py_XDECREF(exc);
5240
12.8M
    return 0;
5241
5242
1.60k
onError:
5243
1.60k
    Py_XDECREF(error_handler_obj);
5244
1.60k
    Py_XDECREF(exc);
5245
1.60k
    return -1;
5246
12.8M
}
5247
5248
5249
static PyObject *
5250
unicode_decode_utf8(const char *s, Py_ssize_t size,
5251
                    _Py_error_handler error_handler, const char *errors,
5252
                    Py_ssize_t *consumed)
5253
111M
{
5254
111M
    if (size == 0) {
5255
3.29M
        if (consumed) {
5256
0
            *consumed = 0;
5257
0
        }
5258
3.29M
        _Py_RETURN_UNICODE_EMPTY();
5259
3.29M
    }
5260
5261
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5262
108M
    if (size == 1 && (unsigned char)s[0] < 128) {
5263
41.1M
        if (consumed) {
5264
0
            *consumed = 1;
5265
0
        }
5266
41.1M
        return get_latin1_char((unsigned char)s[0]);
5267
41.1M
    }
5268
5269
    // I don't know this check is necessary or not. But there is a test
5270
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5271
67.1M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5272
0
        PyErr_NoMemory();
5273
0
        return NULL;
5274
0
    }
5275
5276
67.1M
    const char *starts = s;
5277
67.1M
    const char *end = s + size;
5278
5279
67.1M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5280
67.1M
    if (pos == size) {  // fast path: ASCII string.
5281
54.4M
        PyObject *u = PyUnicode_New(size, 127);
5282
54.4M
        if (u == NULL) {
5283
0
            return NULL;
5284
0
        }
5285
54.4M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5286
54.4M
        if (consumed) {
5287
108
            *consumed = size;
5288
108
        }
5289
54.4M
        return u;
5290
54.4M
    }
5291
5292
12.7M
    int maxchr = 127;
5293
12.7M
    Py_ssize_t maxsize = size;
5294
5295
12.7M
    unsigned char ch = (unsigned char)(s[pos]);
5296
    // error handler other than strict may remove/replace the invalid byte.
5297
    // consumed != NULL allows 1~3 bytes remainings.
5298
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5299
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5300
    // reallocation and copy.
5301
12.7M
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5302
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5303
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5304
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5305
        // means that it is no longer necessary to allocate several times the required amount
5306
        // of memory.
5307
347k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5308
347k
        if (ch < 0xc4) { // latin1
5309
231k
            maxchr = 0xff;
5310
231k
        }
5311
115k
        else if (ch < 0xf0) { // ucs2
5312
102k
            maxchr = 0xffff;
5313
102k
        }
5314
13.2k
        else { // ucs4
5315
13.2k
            maxchr = 0x10ffff;
5316
13.2k
        }
5317
347k
    }
5318
12.7M
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5319
12.7M
    if (!u) {
5320
0
        return NULL;
5321
0
    }
5322
5323
    // Use _PyUnicodeWriter after fast path is failed.
5324
12.7M
    _PyUnicodeWriter writer;
5325
12.7M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5326
12.7M
    if (maxchr <= 255) {
5327
12.6M
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5328
12.6M
        s += pos;
5329
12.6M
        writer.pos = pos;
5330
12.6M
    }
5331
5332
12.7M
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5333
12.7M
                                 error_handler, errors,
5334
12.7M
                                 consumed) < 0) {
5335
1.60k
        _PyUnicodeWriter_Dealloc(&writer);
5336
1.60k
        return NULL;
5337
1.60k
    }
5338
12.7M
    return _PyUnicodeWriter_Finish(&writer);
5339
12.7M
}
5340
5341
5342
// Used by PyUnicodeWriter_WriteUTF8() implementation
5343
int
5344
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
5345
                            const char *s, Py_ssize_t size,
5346
                            _Py_error_handler error_handler, const char *errors,
5347
                            Py_ssize_t *consumed)
5348
3.88M
{
5349
3.88M
    if (size == 0) {
5350
9.09k
        if (consumed) {
5351
0
            *consumed = 0;
5352
0
        }
5353
9.09k
        return 0;
5354
9.09k
    }
5355
5356
    // fast path: try ASCII string.
5357
3.87M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5358
0
        return -1;
5359
0
    }
5360
5361
3.87M
    const char *starts = s;
5362
3.87M
    const char *end = s + size;
5363
3.87M
    Py_ssize_t decoded = 0;
5364
3.87M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5365
3.87M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5366
3.87M
        decoded = ascii_decode(s, end, dest);
5367
3.87M
        writer->pos += decoded;
5368
5369
3.87M
        if (decoded == size) {
5370
3.83M
            if (consumed) {
5371
1.05k
                *consumed = size;
5372
1.05k
            }
5373
3.83M
            return 0;
5374
3.83M
        }
5375
41.2k
        s += decoded;
5376
41.2k
    }
5377
5378
43.5k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5379
43.5k
                                    error_handler, errors, consumed);
5380
3.87M
}
5381
5382
5383
PyObject *
5384
PyUnicode_DecodeUTF8Stateful(const char *s,
5385
                             Py_ssize_t size,
5386
                             const char *errors,
5387
                             Py_ssize_t *consumed)
5388
111M
{
5389
111M
    return unicode_decode_utf8(s, size,
5390
111M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5391
111M
                               errors, consumed);
5392
111M
}
5393
5394
5395
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5396
   non-zero, use strict error handler otherwise.
5397
5398
   On success, write a pointer to a newly allocated wide character string into
5399
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5400
   (in number of wchar_t units) into *wlen (if wlen is set).
5401
5402
   On memory allocation failure, return -1.
5403
5404
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5405
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5406
   is not NULL, write the decoding error message into *reason. */
5407
int
5408
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5409
                 const char **reason, _Py_error_handler errors)
5410
288
{
5411
288
    const char *orig_s = s;
5412
288
    const char *e;
5413
288
    wchar_t *unicode;
5414
288
    Py_ssize_t outpos;
5415
5416
288
    int surrogateescape = 0;
5417
288
    int surrogatepass = 0;
5418
288
    switch (errors)
5419
288
    {
5420
0
    case _Py_ERROR_STRICT:
5421
0
        break;
5422
288
    case _Py_ERROR_SURROGATEESCAPE:
5423
288
        surrogateescape = 1;
5424
288
        break;
5425
0
    case _Py_ERROR_SURROGATEPASS:
5426
0
        surrogatepass = 1;
5427
0
        break;
5428
0
    default:
5429
0
        return -3;
5430
288
    }
5431
5432
    /* Note: size will always be longer than the resulting Unicode
5433
       character count */
5434
288
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5435
0
        return -1;
5436
0
    }
5437
5438
288
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5439
288
    if (!unicode) {
5440
0
        return -1;
5441
0
    }
5442
5443
    /* Unpack UTF-8 encoded data */
5444
288
    e = s + size;
5445
288
    outpos = 0;
5446
288
    while (s < e) {
5447
288
        Py_UCS4 ch;
5448
288
#if SIZEOF_WCHAR_T == 4
5449
288
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5450
#else
5451
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5452
#endif
5453
288
        if (ch > 0xFF) {
5454
0
#if SIZEOF_WCHAR_T == 4
5455
0
            Py_UNREACHABLE();
5456
#else
5457
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5458
            /* write a surrogate pair */
5459
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5460
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5461
#endif
5462
0
        }
5463
288
        else {
5464
288
            if (!ch && s == e) {
5465
288
                break;
5466
288
            }
5467
5468
0
            if (surrogateescape) {
5469
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5470
0
            }
5471
0
            else {
5472
                /* Is it a valid three-byte code? */
5473
0
                if (surrogatepass
5474
0
                    && (e - s) >= 3
5475
0
                    && (s[0] & 0xf0) == 0xe0
5476
0
                    && (s[1] & 0xc0) == 0x80
5477
0
                    && (s[2] & 0xc0) == 0x80)
5478
0
                {
5479
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5480
0
                    s += 3;
5481
0
                    unicode[outpos++] = ch;
5482
0
                }
5483
0
                else {
5484
0
                    PyMem_RawFree(unicode );
5485
0
                    if (reason != NULL) {
5486
0
                        switch (ch) {
5487
0
                        case 0:
5488
0
                            *reason = "unexpected end of data";
5489
0
                            break;
5490
0
                        case 1:
5491
0
                            *reason = "invalid start byte";
5492
0
                            break;
5493
                        /* 2, 3, 4 */
5494
0
                        default:
5495
0
                            *reason = "invalid continuation byte";
5496
0
                            break;
5497
0
                        }
5498
0
                    }
5499
0
                    if (wlen != NULL) {
5500
0
                        *wlen = s - orig_s;
5501
0
                    }
5502
0
                    return -2;
5503
0
                }
5504
0
            }
5505
0
        }
5506
288
    }
5507
288
    unicode[outpos] = L'\0';
5508
288
    if (wlen) {
5509
288
        *wlen = outpos;
5510
288
    }
5511
288
    *wstr = unicode;
5512
288
    return 0;
5513
288
}
5514
5515
5516
wchar_t*
5517
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5518
                               size_t *wlen)
5519
0
{
5520
0
    wchar_t *wstr;
5521
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5522
0
                               &wstr, wlen,
5523
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5524
0
    if (res != 0) {
5525
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5526
0
        assert(res != -3);
5527
0
        if (wlen) {
5528
0
            *wlen = (size_t)res;
5529
0
        }
5530
0
        return NULL;
5531
0
    }
5532
0
    return wstr;
5533
0
}
5534
5535
5536
/* UTF-8 encoder.
5537
5538
   On success, return 0 and write the newly allocated character string (use
5539
   PyMem_Free() to free the memory) into *str.
5540
5541
   On encoding failure, return -2 and write the position of the invalid
5542
   surrogate character into *error_pos (if error_pos is set) and the decoding
5543
   error message into *reason (if reason is set).
5544
5545
   On memory allocation failure, return -1. */
5546
int
5547
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5548
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5549
576
{
5550
576
    const Py_ssize_t max_char_size = 4;
5551
576
    Py_ssize_t len = wcslen(text);
5552
5553
576
    assert(len >= 0);
5554
5555
576
    int surrogateescape = 0;
5556
576
    int surrogatepass = 0;
5557
576
    switch (errors)
5558
576
    {
5559
144
    case _Py_ERROR_STRICT:
5560
144
        break;
5561
432
    case _Py_ERROR_SURROGATEESCAPE:
5562
432
        surrogateescape = 1;
5563
432
        break;
5564
0
    case _Py_ERROR_SURROGATEPASS:
5565
0
        surrogatepass = 1;
5566
0
        break;
5567
0
    default:
5568
0
        return -3;
5569
576
    }
5570
5571
576
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5572
0
        return -1;
5573
0
    }
5574
576
    char *bytes;
5575
576
    if (raw_malloc) {
5576
576
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5577
576
    }
5578
0
    else {
5579
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5580
0
    }
5581
576
    if (bytes == NULL) {
5582
0
        return -1;
5583
0
    }
5584
5585
576
    char *p = bytes;
5586
576
    Py_ssize_t i;
5587
17.9k
    for (i = 0; i < len; ) {
5588
17.3k
        Py_ssize_t ch_pos = i;
5589
17.3k
        Py_UCS4 ch = text[i];
5590
17.3k
        i++;
5591
17.3k
        if (sizeof(wchar_t) == 2
5592
0
            && Py_UNICODE_IS_HIGH_SURROGATE(ch)
5593
0
            && i < len
5594
0
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5595
0
        {
5596
0
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5597
0
            i++;
5598
0
        }
5599
5600
17.3k
        if (ch < 0x80) {
5601
            /* Encode ASCII */
5602
17.3k
            *p++ = (char) ch;
5603
5604
17.3k
        }
5605
0
        else if (ch < 0x0800) {
5606
            /* Encode Latin-1 */
5607
0
            *p++ = (char)(0xc0 | (ch >> 6));
5608
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5609
0
        }
5610
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5611
            /* surrogateescape error handler */
5612
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5613
0
                if (error_pos != NULL) {
5614
0
                    *error_pos = (size_t)ch_pos;
5615
0
                }
5616
0
                if (reason != NULL) {
5617
0
                    *reason = "encoding error";
5618
0
                }
5619
0
                if (raw_malloc) {
5620
0
                    PyMem_RawFree(bytes);
5621
0
                }
5622
0
                else {
5623
0
                    PyMem_Free(bytes);
5624
0
                }
5625
0
                return -2;
5626
0
            }
5627
0
            *p++ = (char)(ch & 0xff);
5628
0
        }
5629
0
        else if (ch < 0x10000) {
5630
0
            *p++ = (char)(0xe0 | (ch >> 12));
5631
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5632
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5633
0
        }
5634
0
        else {  /* ch >= 0x10000 */
5635
0
            assert(ch <= MAX_UNICODE);
5636
            /* Encode UCS4 Unicode ordinals */
5637
0
            *p++ = (char)(0xf0 | (ch >> 18));
5638
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5639
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5640
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5641
0
        }
5642
17.3k
    }
5643
576
    *p++ = '\0';
5644
5645
576
    size_t final_size = (p - bytes);
5646
576
    char *bytes2;
5647
576
    if (raw_malloc) {
5648
576
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5649
576
    }
5650
0
    else {
5651
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5652
0
    }
5653
576
    if (bytes2 == NULL) {
5654
0
        if (error_pos != NULL) {
5655
0
            *error_pos = (size_t)-1;
5656
0
        }
5657
0
        if (raw_malloc) {
5658
0
            PyMem_RawFree(bytes);
5659
0
        }
5660
0
        else {
5661
0
            PyMem_Free(bytes);
5662
0
        }
5663
0
        return -1;
5664
0
    }
5665
576
    *str = bytes2;
5666
576
    return 0;
5667
576
}
5668
5669
5670
/* Primary internal function which creates utf8 encoded bytes objects.
5671
5672
   Allocation strategy:  if the string is short, convert into a stack buffer
5673
   and allocate exactly as much space needed at the end.  Else allocate the
5674
   maximum possible needed (4 result bytes per Unicode character), and return
5675
   the excess memory at the end.
5676
*/
5677
static PyObject *
5678
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5679
                    const char *errors)
5680
18.5M
{
5681
18.5M
    if (!PyUnicode_Check(unicode)) {
5682
0
        PyErr_BadArgument();
5683
0
        return NULL;
5684
0
    }
5685
5686
18.5M
    if (PyUnicode_UTF8(unicode))
5687
9.64M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5688
9.64M
                                         PyUnicode_UTF8_LENGTH(unicode));
5689
5690
8.89M
    int kind = PyUnicode_KIND(unicode);
5691
8.89M
    const void *data = PyUnicode_DATA(unicode);
5692
8.89M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5693
5694
8.89M
    PyBytesWriter *writer;
5695
8.89M
    char *end;
5696
5697
8.89M
    switch (kind) {
5698
0
    default:
5699
0
        Py_UNREACHABLE();
5700
5.94M
    case PyUnicode_1BYTE_KIND:
5701
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5702
5.94M
        assert(!PyUnicode_IS_ASCII(unicode));
5703
5.94M
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5704
5.94M
                                      error_handler, errors, &end);
5705
5.94M
        break;
5706
1.73M
    case PyUnicode_2BYTE_KIND:
5707
1.73M
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5708
1.73M
                                      error_handler, errors, &end);
5709
1.73M
        break;
5710
1.21M
    case PyUnicode_4BYTE_KIND:
5711
1.21M
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5712
1.21M
                                      error_handler, errors, &end);
5713
1.21M
        break;
5714
8.89M
    }
5715
5716
8.89M
    if (writer == NULL) {
5717
150k
        PyBytesWriter_Discard(writer);
5718
150k
        return NULL;
5719
150k
    }
5720
8.74M
    return PyBytesWriter_FinishWithPointer(writer, end);
5721
8.89M
}
5722
5723
static int
5724
unicode_fill_utf8(PyObject *unicode)
5725
157k
{
5726
157k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5727
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5728
157k
    assert(!PyUnicode_IS_ASCII(unicode));
5729
5730
157k
    int kind = PyUnicode_KIND(unicode);
5731
157k
    const void *data = PyUnicode_DATA(unicode);
5732
157k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5733
5734
157k
    PyBytesWriter *writer;
5735
157k
    char *end;
5736
5737
157k
    switch (kind) {
5738
0
    default:
5739
0
        Py_UNREACHABLE();
5740
117k
    case PyUnicode_1BYTE_KIND:
5741
117k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5742
117k
                                      _Py_ERROR_STRICT, NULL, &end);
5743
117k
        break;
5744
32.9k
    case PyUnicode_2BYTE_KIND:
5745
32.9k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5746
32.9k
                                      _Py_ERROR_STRICT, NULL, &end);
5747
32.9k
        break;
5748
7.04k
    case PyUnicode_4BYTE_KIND:
5749
7.04k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5750
7.04k
                                      _Py_ERROR_STRICT, NULL, &end);
5751
7.04k
        break;
5752
157k
    }
5753
157k
    if (writer == NULL) {
5754
206
        return -1;
5755
206
    }
5756
5757
157k
    const char *start = PyBytesWriter_GetData(writer);
5758
157k
    Py_ssize_t len = end - start;
5759
5760
157k
    char *cache = PyMem_Malloc(len + 1);
5761
157k
    if (cache == NULL) {
5762
0
        PyBytesWriter_Discard(writer);
5763
0
        PyErr_NoMemory();
5764
0
        return -1;
5765
0
    }
5766
157k
    memcpy(cache, start, len);
5767
157k
    cache[len] = '\0';
5768
157k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5769
157k
    PyUnicode_SET_UTF8(unicode, cache);
5770
157k
    PyBytesWriter_Discard(writer);
5771
157k
    return 0;
5772
157k
}
5773
5774
/* Encode 'unicode' to a UTF-8 bytes object using the error handler named by
   'errors'.  The handler is passed as _Py_ERROR_UNKNOWN together with the
   'errors' string to unicode_encode_utf8(). */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    PyObject *encoded = unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
    return encoded;
}
5779
5780
5781
/* Encode 'unicode' to a UTF-8 bytes object, passing NULL as the 'errors'
   argument of _PyUnicode_AsUTF8String(). */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, errors);
}
5786
5787
/* --- UTF-32 Codec ------------------------------------------------------- */
5788
5789
PyObject *
5790
PyUnicode_DecodeUTF32(const char *s,
5791
                      Py_ssize_t size,
5792
                      const char *errors,
5793
                      int *byteorder)
5794
163
{
5795
163
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5796
163
}
5797
5798
PyObject *
5799
PyUnicode_DecodeUTF32Stateful(const char *s,
5800
                              Py_ssize_t size,
5801
                              const char *errors,
5802
                              int *byteorder,
5803
                              Py_ssize_t *consumed)
5804
44.5k
{
5805
44.5k
    const char *starts = s;
5806
44.5k
    Py_ssize_t startinpos;
5807
44.5k
    Py_ssize_t endinpos;
5808
44.5k
    _PyUnicodeWriter writer;
5809
44.5k
    const unsigned char *q, *e;
5810
44.5k
    int le, bo = 0;       /* assume native ordering by default */
5811
44.5k
    const char *encoding;
5812
44.5k
    const char *errmsg = "";
5813
44.5k
    PyObject *errorHandler = NULL;
5814
44.5k
    PyObject *exc = NULL;
5815
5816
44.5k
    q = (const unsigned char *)s;
5817
44.5k
    e = q + size;
5818
5819
44.5k
    if (byteorder)
5820
44.4k
        bo = *byteorder;
5821
5822
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5823
       byte order setting accordingly. In native mode, the leading BOM
5824
       mark is skipped, in all other modes, it is copied to the output
5825
       stream as-is (giving a ZWNBSP character). */
5826
44.5k
    if (bo == 0 && size >= 4) {
5827
42.1k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5828
42.1k
        if (bom == 0x0000FEFF) {
5829
211
            bo = -1;
5830
211
            q += 4;
5831
211
        }
5832
41.9k
        else if (bom == 0xFFFE0000) {
5833
363
            bo = 1;
5834
363
            q += 4;
5835
363
        }
5836
42.1k
        if (byteorder)
5837
42.0k
            *byteorder = bo;
5838
42.1k
    }
5839
5840
44.5k
    if (q == e) {
5841
122
        if (consumed)
5842
0
            *consumed = size;
5843
122
        _Py_RETURN_UNICODE_EMPTY();
5844
122
    }
5845
5846
#ifdef WORDS_BIGENDIAN
5847
    le = bo < 0;
5848
#else
5849
44.4k
    le = bo <= 0;
5850
44.4k
#endif
5851
44.4k
    encoding = le ? "utf-32-le" : "utf-32-be";
5852
5853
44.4k
    _PyUnicodeWriter_Init(&writer);
5854
44.4k
    writer.min_length = (e - q + 3) / 4;
5855
44.4k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5856
0
        goto onError;
5857
5858
131k
    while (1) {
5859
131k
        Py_UCS4 ch = 0;
5860
131k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5861
5862
131k
        if (e - q >= 4) {
5863
102k
            int kind = writer.kind;
5864
102k
            void *data = writer.data;
5865
102k
            const unsigned char *last = e - 4;
5866
102k
            Py_ssize_t pos = writer.pos;
5867
102k
            if (le) {
5868
2.76M
                do {
5869
2.76M
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5870
2.76M
                    if (ch > maxch)
5871
97.8k
                        break;
5872
2.67M
                    if (kind != PyUnicode_1BYTE_KIND &&
5873
2.64M
                        Py_UNICODE_IS_SURROGATE(ch))
5874
322
                        break;
5875
2.67M
                    PyUnicode_WRITE(kind, data, pos++, ch);
5876
2.67M
                    q += 4;
5877
2.67M
                } while (q <= last);
5878
99.2k
            }
5879
3.54k
            else {
5880
6.26k
                do {
5881
6.26k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5882
6.26k
                    if (ch > maxch)
5883
3.22k
                        break;
5884
3.04k
                    if (kind != PyUnicode_1BYTE_KIND &&
5885
2.55k
                        Py_UNICODE_IS_SURROGATE(ch))
5886
113
                        break;
5887
2.93k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5888
2.93k
                    q += 4;
5889
2.93k
                } while (q <= last);
5890
3.54k
            }
5891
102k
            writer.pos = pos;
5892
102k
        }
5893
5894
131k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5895
440
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5896
440
            startinpos = ((const char *)q) - starts;
5897
440
            endinpos = startinpos + 4;
5898
440
        }
5899
131k
        else if (ch <= maxch) {
5900
30.3k
            if (q == e || consumed)
5901
5.29k
                break;
5902
            /* remaining bytes at the end? (size should be divisible by 4) */
5903
25.0k
            errmsg = "truncated data";
5904
25.0k
            startinpos = ((const char *)q) - starts;
5905
25.0k
            endinpos = ((const char *)e) - starts;
5906
25.0k
        }
5907
101k
        else {
5908
101k
            if (ch < 0x110000) {
5909
5.04k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5910
0
                    goto onError;
5911
5.04k
                q += 4;
5912
5.04k
                continue;
5913
5.04k
            }
5914
96.0k
            errmsg = "code point not in range(0x110000)";
5915
96.0k
            startinpos = ((const char *)q) - starts;
5916
96.0k
            endinpos = startinpos + 4;
5917
96.0k
        }
5918
5919
        /* The remaining input chars are ignored if the callback
5920
           chooses to skip the input */
5921
121k
        if (unicode_decode_call_errorhandler_writer(
5922
121k
                errors, &errorHandler,
5923
121k
                encoding, errmsg,
5924
121k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5925
121k
                &writer))
5926
39.1k
            goto onError;
5927
121k
    }
5928
5929
5.29k
    if (consumed)
5930
0
        *consumed = (const char *)q-starts;
5931
5932
5.29k
    Py_XDECREF(errorHandler);
5933
5.29k
    Py_XDECREF(exc);
5934
5.29k
    return _PyUnicodeWriter_Finish(&writer);
5935
5936
39.1k
  onError:
5937
39.1k
    _PyUnicodeWriter_Dealloc(&writer);
5938
39.1k
    Py_XDECREF(errorHandler);
5939
39.1k
    Py_XDECREF(exc);
5940
39.1k
    return NULL;
5941
44.4k
}
5942
5943
/* Encode 'str' as UTF-32 and return a new bytes object.

   byteorder: -1 selects "utf-32-le", 1 selects "utf-32-be"; 0 selects
   "utf-32", in which case a BOM (0xFEFF) is written first.

   errors: name of the error handler invoked (through
   unicode_encode_call_errorhandler()) when the per-kind encoder stops
   before the end of the string, reported as "surrogates not allowed".  The
   replacement must be either bytes whose length is a multiple of 4, or an
   ASCII str; anything else raises the encode exception.

   Returns NULL on failure. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);

    /* Output units: one per code point, plus one for the BOM if any.
       Guard the nsize * 4 byte count against overflow. */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) {
        return PyErr_NoMemory();
    }
    Py_ssize_t nsize = len + (byteorder == 0);

#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, nsize * 4);
        if (result == NULL) {
            return NULL;
        }

        /* output buffer is 4-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 4));
        uint32_t *dst = (uint32_t *)PyBytes_AS_STRING(result);
        if (byteorder == 0) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
                                 &dst, native_ordering);
        }
        return result;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported to the error handler. */
    const char *encoding = (byteorder == -1) ? "utf-32-le"
                         : (byteorder == 1)  ? "utf-32-be"
                         :                     "utf-32";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as possible; the helper returns how many code
           points it handled. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The encoder stopped at position 'pos': ask the error handler. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        Py_ssize_t repsize;
        Py_ssize_t moreunits;
        int rep_is_bytes = PyBytes_Check(rep);
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            /* A bytes replacement must be whole 4-byte units. */
            if ((repsize & 3) != 0) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            /* A str replacement must be pure ASCII. */
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Units already reserved for the skipped input offset the need. */
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6096
6097
/* Encode 'unicode' as UTF-32 with a NULL 'errors' argument and byteorder 0
   (which makes _PyUnicode_EncodeUTF32() emit a BOM). */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    const int byteorder = 0;
    return _PyUnicode_EncodeUTF32(unicode, NULL, byteorder);
}
6102
6103
/* --- UTF-16 Codec ------------------------------------------------------- */
6104
6105
PyObject *
6106
PyUnicode_DecodeUTF16(const char *s,
6107
                      Py_ssize_t size,
6108
                      const char *errors,
6109
                      int *byteorder)
6110
177
{
6111
177
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6112
177
}
6113
6114
PyObject *
6115
PyUnicode_DecodeUTF16Stateful(const char *s,
6116
                              Py_ssize_t size,
6117
                              const char *errors,
6118
                              int *byteorder,
6119
                              Py_ssize_t *consumed)
6120
16.4k
{
6121
16.4k
    const char *starts = s;
6122
16.4k
    Py_ssize_t startinpos;
6123
16.4k
    Py_ssize_t endinpos;
6124
16.4k
    _PyUnicodeWriter writer;
6125
16.4k
    const unsigned char *q, *e;
6126
16.4k
    int bo = 0;       /* assume native ordering by default */
6127
16.4k
    int native_ordering;
6128
16.4k
    const char *errmsg = "";
6129
16.4k
    PyObject *errorHandler = NULL;
6130
16.4k
    PyObject *exc = NULL;
6131
16.4k
    const char *encoding;
6132
6133
16.4k
    q = (const unsigned char *)s;
6134
16.4k
    e = q + size;
6135
6136
16.4k
    if (byteorder)
6137
16.2k
        bo = *byteorder;
6138
6139
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6140
       byte order setting accordingly. In native mode, the leading BOM
6141
       mark is skipped, in all other modes, it is copied to the output
6142
       stream as-is (giving a ZWNBSP character). */
6143
16.4k
    if (bo == 0 && size >= 2) {
6144
15.5k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6145
15.5k
        if (bom == 0xFEFF) {
6146
276
            q += 2;
6147
276
            bo = -1;
6148
276
        }
6149
15.3k
        else if (bom == 0xFFFE) {
6150
2.21k
            q += 2;
6151
2.21k
            bo = 1;
6152
2.21k
        }
6153
15.5k
        if (byteorder)
6154
15.4k
            *byteorder = bo;
6155
15.5k
    }
6156
6157
16.4k
    if (q == e) {
6158
56
        if (consumed)
6159
0
            *consumed = size;
6160
56
        _Py_RETURN_UNICODE_EMPTY();
6161
56
    }
6162
6163
16.3k
#if PY_LITTLE_ENDIAN
6164
16.3k
    native_ordering = bo <= 0;
6165
16.3k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6166
#else
6167
    native_ordering = bo >= 0;
6168
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6169
#endif
6170
6171
    /* Note: size will always be longer than the resulting Unicode
6172
       character count normally.  Error handler will take care of
6173
       resizing when needed. */
6174
16.3k
    _PyUnicodeWriter_Init(&writer);
6175
16.3k
    writer.min_length = (e - q + 1) / 2;
6176
16.3k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6177
0
        goto onError;
6178
6179
59.9k
    while (1) {
6180
59.9k
        Py_UCS4 ch = 0;
6181
59.9k
        if (e - q >= 2) {
6182
52.0k
            int kind = writer.kind;
6183
52.0k
            if (kind == PyUnicode_1BYTE_KIND) {
6184
19.8k
                if (PyUnicode_IS_ASCII(writer.buffer))
6185
15.7k
                    ch = asciilib_utf16_decode(&q, e,
6186
15.7k
                            (Py_UCS1*)writer.data, &writer.pos,
6187
15.7k
                            native_ordering);
6188
4.10k
                else
6189
4.10k
                    ch = ucs1lib_utf16_decode(&q, e,
6190
4.10k
                            (Py_UCS1*)writer.data, &writer.pos,
6191
4.10k
                            native_ordering);
6192
32.1k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6193
17.5k
                ch = ucs2lib_utf16_decode(&q, e,
6194
17.5k
                        (Py_UCS2*)writer.data, &writer.pos,
6195
17.5k
                        native_ordering);
6196
17.5k
            } else {
6197
14.6k
                assert(kind == PyUnicode_4BYTE_KIND);
6198
14.6k
                ch = ucs4lib_utf16_decode(&q, e,
6199
14.6k
                        (Py_UCS4*)writer.data, &writer.pos,
6200
14.6k
                        native_ordering);
6201
14.6k
            }
6202
52.0k
        }
6203
6204
59.9k
        switch (ch)
6205
59.9k
        {
6206
17.3k
        case 0:
6207
            /* remaining byte at the end? (size should be even) */
6208
17.3k
            if (q == e || consumed)
6209
10.5k
                goto End;
6210
6.80k
            errmsg = "truncated data";
6211
6.80k
            startinpos = ((const char *)q) - starts;
6212
6.80k
            endinpos = ((const char *)e) - starts;
6213
6.80k
            break;
6214
            /* The remaining input chars are ignored if the callback
6215
               chooses to skip the input */
6216
1.73k
        case 1:
6217
1.73k
            q -= 2;
6218
1.73k
            if (consumed)
6219
0
                goto End;
6220
1.73k
            errmsg = "unexpected end of data";
6221
1.73k
            startinpos = ((const char *)q) - starts;
6222
1.73k
            endinpos = ((const char *)e) - starts;
6223
1.73k
            break;
6224
13.5k
        case 2:
6225
13.5k
            errmsg = "illegal encoding";
6226
13.5k
            startinpos = ((const char *)q) - 2 - starts;
6227
13.5k
            endinpos = startinpos + 2;
6228
13.5k
            break;
6229
8.79k
        case 3:
6230
8.79k
            errmsg = "illegal UTF-16 surrogate";
6231
8.79k
            startinpos = ((const char *)q) - 4 - starts;
6232
8.79k
            endinpos = startinpos + 2;
6233
8.79k
            break;
6234
18.4k
        default:
6235
18.4k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6236
0
                goto onError;
6237
18.4k
            continue;
6238
59.9k
        }
6239
6240
30.8k
        if (unicode_decode_call_errorhandler_writer(
6241
30.8k
                errors,
6242
30.8k
                &errorHandler,
6243
30.8k
                encoding, errmsg,
6244
30.8k
                &starts,
6245
30.8k
                (const char **)&e,
6246
30.8k
                &startinpos,
6247
30.8k
                &endinpos,
6248
30.8k
                &exc,
6249
30.8k
                (const char **)&q,
6250
30.8k
                &writer))
6251
5.79k
            goto onError;
6252
30.8k
    }
6253
6254
10.5k
End:
6255
10.5k
    if (consumed)
6256
0
        *consumed = (const char *)q-starts;
6257
6258
10.5k
    Py_XDECREF(errorHandler);
6259
10.5k
    Py_XDECREF(exc);
6260
10.5k
    return _PyUnicodeWriter_Finish(&writer);
6261
6262
5.79k
  onError:
6263
5.79k
    _PyUnicodeWriter_Dealloc(&writer);
6264
5.79k
    Py_XDECREF(errorHandler);
6265
5.79k
    Py_XDECREF(exc);
6266
5.79k
    return NULL;
6267
16.3k
}
6268
6269
/* Encode the str object `str` as UTF-16 and return a bytes object.
 *
 *   errors    - error handler name used when an unencodable character
 *               (reported as "surrogates not allowed") is met.
 *   byteorder - negative: little-endian; positive: big-endian; zero:
 *               native order with a leading BOM (0xFEFF) code unit.
 *
 * Returns a new reference, or NULL on failure (bad argument, size
 * overflow, allocation failure, or an error raised by the handler).
 */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    /* One extra code unit is needed for the BOM when byteorder is 0. */
    const int with_bom = (byteorder == 0);

    /* Every code point above the BMP takes a surrogate pair, i.e. one
       additional 16-bit unit.  Only 4-byte strings can contain them. */
    Py_ssize_t pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *chars = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (chars[i] >= 0x10000) {
                pairs++;
            }
        }
    }
    /* Guard the `nsize * 2` byte count below against overflow. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - with_bom) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nsize = len + pairs + with_bom;

#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *bytes = PyBytes_FromStringAndSize(NULL, nsize * 2);
        if (bytes == NULL) {
            return NULL;
        }

        /* output buffer is 2-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(bytes), 2));
        unsigned short *dst = (unsigned short *)PyBytes_AS_STRING(bytes);
        if (with_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &dst, native_ordering);
        }
        return bytes;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
    unsigned short *out = PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported to the error handler. */
    const char *encoding = (byteorder < 0) ? "utf-16-le"
                         : (byteorder > 0) ? "utf-16-be"
                         : "utf-16";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as possible; the helper returns how many
           characters it handled before stopping. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The helper stopped early: hand the character at `pos` to the
           error handler. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize;
        Py_ssize_t moreunits;
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            /* A bytes replacement must be a whole number of 16-bit units. */
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 2;
        }
        else {
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            /* Only ASCII str replacements are accepted. */
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Subtract the units already reserved for the input characters
           that the handler chose to skip. */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
    encoding of a string containing isolated surrogates and the 'ignore' handler
    is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6434
6435
/* Encode `unicode` as UTF-16 by calling _PyUnicode_EncodeUTF16() with
   errors == NULL and byteorder == 0, and return its result unchanged. */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    const char *errors = NULL;
    const int byteorder = 0;

    return _PyUnicode_EncodeUTF16(unicode, errors, byteorder);
}
6440
6441
_PyUnicode_Name_CAPI *
6442
_PyUnicode_GetNameCAPI(void)
6443
15.9k
{
6444
15.9k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6445
15.9k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6446
6447
15.9k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6448
15.9k
    if (ucnhash_capi == NULL) {
6449
2
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6450
2
                PyUnicodeData_CAPSULE_NAME, 1);
6451
6452
        // It's fine if we overwrite the value here. It's always the same value.
6453
2
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6454
2
    }
6455
15.9k
    return ucnhash_capi;
6456
15.9k
}
6457
6458
/* --- Unicode Escape Codec ----------------------------------------------- */
6459
6460
PyObject *
6461
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6462
                               Py_ssize_t size,
6463
                               const char *errors,
6464
                               Py_ssize_t *consumed,
6465
                               int *first_invalid_escape_char,
6466
                               const char **first_invalid_escape_ptr)
6467
29.0k
{
6468
29.0k
    const char *starts = s;
6469
29.0k
    const char *initial_starts = starts;
6470
29.0k
    _PyUnicodeWriter writer;
6471
29.0k
    const char *end;
6472
29.0k
    PyObject *errorHandler = NULL;
6473
29.0k
    PyObject *exc = NULL;
6474
29.0k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6475
6476
    // so we can remember if we've seen an invalid escape char or not
6477
29.0k
    *first_invalid_escape_char = -1;
6478
29.0k
    *first_invalid_escape_ptr = NULL;
6479
6480
29.0k
    if (size == 0) {
6481
2.79k
        if (consumed) {
6482
0
            *consumed = 0;
6483
0
        }
6484
2.79k
        _Py_RETURN_UNICODE_EMPTY();
6485
2.79k
    }
6486
    /* Escaped strings will always be longer than the resulting
6487
       Unicode string, so we start with size here and then reduce the
6488
       length after conversion to the true value.
6489
       (but if the error callback returns a long replacement string
6490
       we'll have to allocate more space) */
6491
26.2k
    _PyUnicodeWriter_Init(&writer);
6492
26.2k
    writer.min_length = size;
6493
26.2k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6494
0
        goto onError;
6495
0
    }
6496
6497
26.2k
    end = s + size;
6498
8.92M
    while (s < end) {
6499
8.89M
        unsigned char c = (unsigned char) *s++;
6500
8.89M
        Py_UCS4 ch;
6501
8.89M
        int count;
6502
8.89M
        const char *message;
6503
6504
8.89M
#define WRITE_ASCII_CHAR(ch)                                                  \
6505
8.89M
            do {                                                              \
6506
114k
                assert(ch <= 127);                                            \
6507
114k
                assert(writer.pos < writer.size);                             \
6508
114k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6509
114k
            } while(0)
6510
6511
8.89M
#define WRITE_CHAR(ch)                                                        \
6512
8.89M
            do {                                                              \
6513
8.81M
                if (ch <= writer.maxchar) {                                   \
6514
8.80M
                    assert(writer.pos < writer.size);                         \
6515
8.80M
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6516
8.80M
                }                                                             \
6517
8.81M
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6518
0
                    goto onError;                                             \
6519
0
                }                                                             \
6520
8.81M
            } while(0)
6521
6522
        /* Non-escape characters are interpreted as Unicode ordinals */
6523
8.89M
        if (c != '\\') {
6524
8.61M
            WRITE_CHAR(c);
6525
8.61M
            continue;
6526
8.61M
        }
6527
6528
277k
        Py_ssize_t startinpos = s - starts - 1;
6529
        /* \ - Escapes */
6530
277k
        if (s >= end) {
6531
15
            message = "\\ at end of string";
6532
15
            goto incomplete;
6533
15
        }
6534
277k
        c = (unsigned char) *s++;
6535
6536
277k
        assert(writer.pos < writer.size);
6537
277k
        switch (c) {
6538
6539
            /* \x escapes */
6540
1.59k
        case '\n': continue;
6541
37.4k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6542
1.61k
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6543
3.58k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6544
3.04k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6545
        /* FF */
6546
6.54k
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6547
1.48k
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6548
2.07k
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6549
6.36k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6550
        /* VT */
6551
14.2k
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6552
        /* BEL, not classic C */
6553
2.42k
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6554
6555
            /* \OOO (octal) escapes */
6556
42.2k
        case '0': case '1': case '2': case '3':
6557
109k
        case '4': case '5': case '6': case '7':
6558
109k
            ch = c - '0';
6559
109k
            if (s < end && '0' <= *s && *s <= '7') {
6560
64.4k
                ch = (ch<<3) + *s++ - '0';
6561
64.4k
                if (s < end && '0' <= *s && *s <= '7') {
6562
50.6k
                    ch = (ch<<3) + *s++ - '0';
6563
50.6k
                }
6564
64.4k
            }
6565
109k
            if (ch > 0377) {
6566
48.6k
                if (*first_invalid_escape_char == -1) {
6567
924
                    *first_invalid_escape_char = ch;
6568
924
                    if (starts == initial_starts) {
6569
                        /* Back up 3 chars, since we've already incremented s. */
6570
924
                        *first_invalid_escape_ptr = s - 3;
6571
924
                    }
6572
924
                }
6573
48.6k
            }
6574
109k
            WRITE_CHAR(ch);
6575
109k
            continue;
6576
6577
            /* hex escapes */
6578
            /* \xXX */
6579
109k
        case 'x':
6580
14.6k
            count = 2;
6581
14.6k
            message = "truncated \\xXX escape";
6582
14.6k
            goto hexescape;
6583
6584
            /* \uXXXX */
6585
6.40k
        case 'u':
6586
6.40k
            count = 4;
6587
6.40k
            message = "truncated \\uXXXX escape";
6588
6.40k
            goto hexescape;
6589
6590
            /* \UXXXXXXXX */
6591
15.6k
        case 'U':
6592
15.6k
            count = 8;
6593
15.6k
            message = "truncated \\UXXXXXXXX escape";
6594
36.7k
        hexescape:
6595
216k
            for (ch = 0; count; ++s, --count) {
6596
179k
                if (s >= end) {
6597
18
                    goto incomplete;
6598
18
                }
6599
179k
                c = (unsigned char)*s;
6600
179k
                ch <<= 4;
6601
179k
                if (c >= '0' && c <= '9') {
6602
125k
                    ch += c - '0';
6603
125k
                }
6604
54.5k
                else if (c >= 'a' && c <= 'f') {
6605
54.1k
                    ch += c - ('a' - 10);
6606
54.1k
                }
6607
392
                else if (c >= 'A' && c <= 'F') {
6608
354
                    ch += c - ('A' - 10);
6609
354
                }
6610
38
                else {
6611
38
                    goto error;
6612
38
                }
6613
179k
            }
6614
6615
            /* when we get here, ch is a 32-bit unicode character */
6616
36.6k
            if (ch > MAX_UNICODE) {
6617
8
                message = "illegal Unicode character";
6618
8
                goto error;
6619
8
            }
6620
6621
36.6k
            WRITE_CHAR(ch);
6622
36.6k
            continue;
6623
6624
            /* \N{name} */
6625
36.6k
        case 'N':
6626
15.9k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6627
15.9k
            if (ucnhash_capi == NULL) {
6628
0
                PyErr_SetString(
6629
0
                        PyExc_UnicodeError,
6630
0
                        "\\N escapes not supported (can't load unicodedata module)"
6631
0
                );
6632
0
                goto onError;
6633
0
            }
6634
6635
15.9k
            message = "malformed \\N character escape";
6636
15.9k
            if (s >= end) {
6637
6
                goto incomplete;
6638
6
            }
6639
15.9k
            if (*s == '{') {
6640
15.9k
                const char *start = ++s;
6641
15.9k
                size_t namelen;
6642
                /* look for the closing brace */
6643
113k
                while (s < end && *s != '}')
6644
97.8k
                    s++;
6645
15.9k
                if (s >= end) {
6646
26
                    goto incomplete;
6647
26
                }
6648
15.9k
                namelen = s - start;
6649
15.9k
                if (namelen) {
6650
                    /* found a name.  look it up in the unicode database */
6651
15.9k
                    s++;
6652
15.9k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6653
15.9k
                    if (namelen <= INT_MAX &&
6654
15.9k
                        ucnhash_capi->getcode(start, (int)namelen,
6655
15.9k
                                              &ch, 0)) {
6656
15.7k
                        assert(ch <= MAX_UNICODE);
6657
15.7k
                        WRITE_CHAR(ch);
6658
15.7k
                        continue;
6659
15.7k
                    }
6660
114
                    message = "unknown Unicode character name";
6661
114
                }
6662
15.9k
            }
6663
135
            goto error;
6664
6665
35.4k
        default:
6666
35.4k
            if (*first_invalid_escape_char == -1) {
6667
3.64k
                *first_invalid_escape_char = c;
6668
3.64k
                if (starts == initial_starts) {
6669
                    /* Back up one char, since we've already incremented s. */
6670
3.64k
                    *first_invalid_escape_ptr = s - 1;
6671
3.64k
                }
6672
3.64k
            }
6673
35.4k
            WRITE_ASCII_CHAR('\\');
6674
35.4k
            WRITE_CHAR(c);
6675
35.4k
            continue;
6676
277k
        }
6677
6678
65
      incomplete:
6679
65
        if (consumed) {
6680
0
            *consumed = startinpos;
6681
0
            break;
6682
0
        }
6683
246
      error:;
6684
246
        Py_ssize_t endinpos = s-starts;
6685
246
        writer.min_length = end - s + writer.pos;
6686
246
        if (unicode_decode_call_errorhandler_writer(
6687
246
                errors, &errorHandler,
6688
246
                "unicodeescape", message,
6689
246
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6690
246
                &writer)) {
6691
246
            goto onError;
6692
246
        }
6693
246
        assert(end - s <= writer.size - writer.pos);
6694
6695
0
#undef WRITE_ASCII_CHAR
6696
0
#undef WRITE_CHAR
6697
0
    }
6698
6699
25.9k
    Py_XDECREF(errorHandler);
6700
25.9k
    Py_XDECREF(exc);
6701
25.9k
    return _PyUnicodeWriter_Finish(&writer);
6702
6703
246
  onError:
6704
246
    _PyUnicodeWriter_Dealloc(&writer);
6705
246
    Py_XDECREF(errorHandler);
6706
246
    Py_XDECREF(exc);
6707
246
    return NULL;
6708
26.2k
}
6709
6710
PyObject *
6711
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6712
                              Py_ssize_t size,
6713
                              const char *errors,
6714
                              Py_ssize_t *consumed)
6715
545
{
6716
545
    int first_invalid_escape_char;
6717
545
    const char *first_invalid_escape_ptr;
6718
545
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6719
545
                                                      consumed,
6720
545
                                                      &first_invalid_escape_char,
6721
545
                                                      &first_invalid_escape_ptr);
6722
545
    if (result == NULL)
6723
122
        return NULL;
6724
423
    if (first_invalid_escape_char != -1) {
6725
303
        if (first_invalid_escape_char > 0xff) {
6726
96
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6727
96
                                 "\"\\%o\" is an invalid octal escape sequence. "
6728
96
                                 "Such sequences will not work in the future. ",
6729
96
                                 first_invalid_escape_char) < 0)
6730
0
            {
6731
0
                Py_DECREF(result);
6732
0
                return NULL;
6733
0
            }
6734
96
        }
6735
207
        else {
6736
207
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6737
207
                                 "\"\\%c\" is an invalid escape sequence. "
6738
207
                                 "Such sequences will not work in the future. ",
6739
207
                                 first_invalid_escape_char) < 0)
6740
0
            {
6741
0
                Py_DECREF(result);
6742
0
                return NULL;
6743
0
            }
6744
207
        }
6745
303
    }
6746
423
    return result;
6747
423
}
6748
6749
PyObject *
6750
PyUnicode_DecodeUnicodeEscape(const char *s,
6751
                              Py_ssize_t size,
6752
                              const char *errors)
6753
0
{
6754
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6755
0
}
6756
6757
/* Return a Unicode-Escape string version of the Unicode object. */
6758
6759
PyObject *
6760
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6761
317k
{
6762
317k
    if (!PyUnicode_Check(unicode)) {
6763
0
        PyErr_BadArgument();
6764
0
        return NULL;
6765
0
    }
6766
6767
317k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
6768
317k
    if (len == 0) {
6769
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
6770
0
    }
6771
317k
    int kind = PyUnicode_KIND(unicode);
6772
317k
    const void *data = PyUnicode_DATA(unicode);
6773
6774
    /* Initial allocation is based on the longest-possible character
6775
     * escape.
6776
     *
6777
     * For UCS1 strings it's '\xxx', 4 bytes per source character.
6778
     * For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6779
     * For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. */
6780
317k
    Py_ssize_t expandsize = kind * 2 + 2;
6781
317k
    if (len > PY_SSIZE_T_MAX / expandsize) {
6782
0
        return PyErr_NoMemory();
6783
0
    }
6784
6785
317k
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
6786
317k
    if (writer == NULL) {
6787
0
        return NULL;
6788
0
    }
6789
317k
    char *p = PyBytesWriter_GetData(writer);
6790
6791
634k
    for (Py_ssize_t i = 0; i < len; i++) {
6792
317k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6793
6794
        /* U+0000-U+00ff range */
6795
317k
        if (ch < 0x100) {
6796
309k
            if (ch >= ' ' && ch < 127) {
6797
17.7k
                if (ch != '\\') {
6798
                    /* Copy printable US ASCII as-is */
6799
0
                    *p++ = (char) ch;
6800
0
                }
6801
                /* Escape backslashes */
6802
17.7k
                else {
6803
17.7k
                    *p++ = '\\';
6804
17.7k
                    *p++ = '\\';
6805
17.7k
                }
6806
17.7k
            }
6807
6808
            /* Map special whitespace to '\t', \n', '\r' */
6809
292k
            else if (ch == '\t') {
6810
5.44k
                *p++ = '\\';
6811
5.44k
                *p++ = 't';
6812
5.44k
            }
6813
286k
            else if (ch == '\n') {
6814
2.53k
                *p++ = '\\';
6815
2.53k
                *p++ = 'n';
6816
2.53k
            }
6817
284k
            else if (ch == '\r') {
6818
588
                *p++ = '\\';
6819
588
                *p++ = 'r';
6820
588
            }
6821
6822
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6823
283k
            else {
6824
283k
                *p++ = '\\';
6825
283k
                *p++ = 'x';
6826
283k
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6827
283k
                *p++ = Py_hexdigits[ch & 0x000F];
6828
283k
            }
6829
309k
        }
6830
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6831
7.16k
        else if (ch < 0x10000) {
6832
5.96k
            *p++ = '\\';
6833
5.96k
            *p++ = 'u';
6834
5.96k
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6835
5.96k
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6836
5.96k
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6837
5.96k
            *p++ = Py_hexdigits[ch & 0x000F];
6838
5.96k
        }
6839
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6840
1.20k
        else {
6841
6842
            /* Make sure that the first two digits are zero */
6843
1.20k
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6844
1.20k
            *p++ = '\\';
6845
1.20k
            *p++ = 'U';
6846
1.20k
            *p++ = '0';
6847
1.20k
            *p++ = '0';
6848
1.20k
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6849
1.20k
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6850
1.20k
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6851
1.20k
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6852
1.20k
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6853
1.20k
            *p++ = Py_hexdigits[ch & 0x0000000F];
6854
1.20k
        }
6855
317k
    }
6856
6857
317k
    return PyBytesWriter_FinishWithPointer(writer, p);
6858
317k
}
6859
6860
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6861
6862
PyObject *
6863
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6864
                                          Py_ssize_t size,
6865
                                          const char *errors,
6866
                                          Py_ssize_t *consumed)
6867
112
{
6868
112
    const char *starts = s;
6869
112
    _PyUnicodeWriter writer;
6870
112
    const char *end;
6871
112
    PyObject *errorHandler = NULL;
6872
112
    PyObject *exc = NULL;
6873
6874
112
    if (size == 0) {
6875
0
        if (consumed) {
6876
0
            *consumed = 0;
6877
0
        }
6878
0
        _Py_RETURN_UNICODE_EMPTY();
6879
0
    }
6880
6881
    /* Escaped strings will always be longer than the resulting
6882
       Unicode string, so we start with size here and then reduce the
6883
       length after conversion to the true value. (But decoding error
6884
       handler might have to resize the string) */
6885
112
    _PyUnicodeWriter_Init(&writer);
6886
112
    writer.min_length = size;
6887
112
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6888
0
        goto onError;
6889
0
    }
6890
6891
112
    end = s + size;
6892
46.7k
    while (s < end) {
6893
46.6k
        unsigned char c = (unsigned char) *s++;
6894
46.6k
        Py_UCS4 ch;
6895
46.6k
        int count;
6896
46.6k
        const char *message;
6897
6898
46.6k
#define WRITE_CHAR(ch)                                                        \
6899
46.6k
            do {                                                              \
6900
46.6k
                if (ch <= writer.maxchar) {                                   \
6901
46.5k
                    assert(writer.pos < writer.size);                         \
6902
46.5k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6903
46.5k
                }                                                             \
6904
46.6k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6905
0
                    goto onError;                                             \
6906
0
                }                                                             \
6907
46.6k
            } while(0)
6908
6909
        /* Non-escape characters are interpreted as Unicode ordinals */
6910
46.6k
        if (c != '\\' || (s >= end && !consumed)) {
6911
43.5k
            WRITE_CHAR(c);
6912
43.5k
            continue;
6913
43.5k
        }
6914
6915
3.07k
        Py_ssize_t startinpos = s - starts - 1;
6916
        /* \ - Escapes */
6917
3.07k
        if (s >= end) {
6918
0
            assert(consumed);
6919
            // Set message to silent compiler warning.
6920
            // Actually it is never used.
6921
0
            message = "\\ at end of string";
6922
0
            goto incomplete;
6923
0
        }
6924
6925
3.07k
        c = (unsigned char) *s++;
6926
3.07k
        if (c == 'u') {
6927
404
            count = 4;
6928
404
            message = "truncated \\uXXXX escape";
6929
404
        }
6930
2.66k
        else if (c == 'U') {
6931
542
            count = 8;
6932
542
            message = "truncated \\UXXXXXXXX escape";
6933
542
        }
6934
2.12k
        else {
6935
2.12k
            assert(writer.pos < writer.size);
6936
2.12k
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6937
2.12k
            WRITE_CHAR(c);
6938
2.12k
            continue;
6939
2.12k
        }
6940
6941
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6942
6.78k
        for (ch = 0; count; ++s, --count) {
6943
5.86k
            if (s >= end) {
6944
7
                goto incomplete;
6945
7
            }
6946
5.85k
            c = (unsigned char)*s;
6947
5.85k
            ch <<= 4;
6948
5.85k
            if (c >= '0' && c <= '9') {
6949
5.00k
                ch += c - '0';
6950
5.00k
            }
6951
850
            else if (c >= 'a' && c <= 'f') {
6952
736
                ch += c - ('a' - 10);
6953
736
            }
6954
114
            else if (c >= 'A' && c <= 'F') {
6955
95
                ch += c - ('A' - 10);
6956
95
            }
6957
19
            else {
6958
19
                goto error;
6959
19
            }
6960
5.85k
        }
6961
920
        if (ch > MAX_UNICODE) {
6962
3
            message = "\\Uxxxxxxxx out of range";
6963
3
            goto error;
6964
3
        }
6965
917
        WRITE_CHAR(ch);
6966
917
        continue;
6967
6968
917
      incomplete:
6969
7
        if (consumed) {
6970
0
            *consumed = startinpos;
6971
0
            break;
6972
0
        }
6973
29
      error:;
6974
29
        Py_ssize_t endinpos = s-starts;
6975
29
        writer.min_length = end - s + writer.pos;
6976
29
        if (unicode_decode_call_errorhandler_writer(
6977
29
                errors, &errorHandler,
6978
29
                "rawunicodeescape", message,
6979
29
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6980
29
                &writer)) {
6981
29
            goto onError;
6982
29
        }
6983
29
        assert(end - s <= writer.size - writer.pos);
6984
6985
0
#undef WRITE_CHAR
6986
0
    }
6987
83
    Py_XDECREF(errorHandler);
6988
83
    Py_XDECREF(exc);
6989
83
    return _PyUnicodeWriter_Finish(&writer);
6990
6991
29
  onError:
6992
29
    _PyUnicodeWriter_Dealloc(&writer);
6993
29
    Py_XDECREF(errorHandler);
6994
29
    Py_XDECREF(exc);
6995
29
    return NULL;
6996
112
}
6997
6998
PyObject *
6999
PyUnicode_DecodeRawUnicodeEscape(const char *s,
7000
                                 Py_ssize_t size,
7001
                                 const char *errors)
7002
0
{
7003
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
7004
0
}
7005
7006
7007
/* Encode `unicode` with the raw-unicode-escape codec.

   Code points below U+0100 are emitted as single bytes, U+0100..U+FFFF as
   \uHHHH and everything above as \U00HHHHHH.  Calls PyErr_BadArgument() and
   returns NULL when `unicode` is not a str; otherwise returns a bytes
   object, or NULL on failure. */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    if (length == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    if (kind == PyUnicode_1BYTE_KIND) {
        /* Every character is below U+0100: the storage is already the
           encoded form. */
        return PyBytes_FromStringAndSize(data, length);
    }

    /* Worst-case output per character: 10 bytes for 4-byte kinds, 6 bytes
       for 2-byte kinds (and 4 for 1-byte kinds). */
    const Py_ssize_t bytes_per_char = kind * 2 + 2;
    if (length > PY_SSIZE_T_MAX / bytes_per_char) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(bytes_per_char * length);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < length; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int shift;

        if (ch < 0x100) {
            /* U+0000-U+00ff: copied as-is */
            *out++ = (char) ch;
            continue;
        }

        *out++ = '\\';
        if (ch < 0x10000) {
            /* U+0100-U+ffff: '\uHHHH', four hex digits */
            *out++ = 'u';
            shift = 12;
        }
        else {
            /* U+010000-U+10ffff: '\U00HHHHHH', two fixed zeros followed by
               six hex digits */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            shift = 20;
        }
        /* Most significant nibble first. */
        for (; shift >= 0; shift -= 4) {
            *out++ = Py_hexdigits[(ch >> shift) & 0xf];
        }
    }

    return PyBytesWriter_FinishWithPointer(writer, out);
}
7071
7072
/* --- Latin-1 Codec ------------------------------------------------------ */
7073
7074
PyObject *
7075
PyUnicode_DecodeLatin1(const char *s,
7076
                       Py_ssize_t size,
7077
                       const char *errors)
7078
2.93M
{
7079
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7080
2.93M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7081
2.93M
}
7082
7083
/* create or adjust a UnicodeEncodeError */
7084
static void
7085
make_encode_exception(PyObject **exceptionObject,
7086
                      const char *encoding,
7087
                      PyObject *unicode,
7088
                      Py_ssize_t startpos, Py_ssize_t endpos,
7089
                      const char *reason)
7090
739k
{
7091
739k
    if (*exceptionObject == NULL) {
7092
739k
        *exceptionObject = PyObject_CallFunction(
7093
739k
            PyExc_UnicodeEncodeError, "sOnns",
7094
739k
            encoding, unicode, startpos, endpos, reason);
7095
739k
    }
7096
0
    else {
7097
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7098
0
            goto onError;
7099
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7100
0
            goto onError;
7101
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7102
0
            goto onError;
7103
0
        return;
7104
0
      onError:
7105
0
        Py_CLEAR(*exceptionObject);
7106
0
    }
7107
739k
}
7108
7109
/* raises a UnicodeEncodeError */
7110
static void
7111
raise_encode_exception(PyObject **exceptionObject,
7112
                       const char *encoding,
7113
                       PyObject *unicode,
7114
                       Py_ssize_t startpos, Py_ssize_t endpos,
7115
                       const char *reason)
7116
576k
{
7117
576k
    make_encode_exception(exceptionObject,
7118
576k
                          encoding, unicode, startpos, endpos, reason);
7119
576k
    if (*exceptionObject != NULL)
7120
576k
        PyCodec_StrictErrors(*exceptionObject);
7121
576k
}
7122
7123
/* error handling callback helper:
7124
   build arguments, call the callback and check the arguments,
7125
   put the result into newpos and return the replacement string, which
7126
   has to be freed by the caller */
7127
static PyObject *
7128
unicode_encode_call_errorhandler(const char *errors,
7129
                                 PyObject **errorHandler,
7130
                                 const char *encoding, const char *reason,
7131
                                 PyObject *unicode, PyObject **exceptionObject,
7132
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7133
                                 Py_ssize_t *newpos)
7134
162k
{
7135
162k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7136
162k
    Py_ssize_t len;
7137
162k
    PyObject *restuple;
7138
162k
    PyObject *resunicode;
7139
7140
162k
    if (*errorHandler == NULL) {
7141
162k
        *errorHandler = PyCodec_LookupError(errors);
7142
162k
        if (*errorHandler == NULL)
7143
0
            return NULL;
7144
162k
    }
7145
7146
162k
    len = PyUnicode_GET_LENGTH(unicode);
7147
7148
162k
    make_encode_exception(exceptionObject,
7149
162k
                          encoding, unicode, startpos, endpos, reason);
7150
162k
    if (*exceptionObject == NULL)
7151
0
        return NULL;
7152
7153
162k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7154
162k
    if (restuple == NULL)
7155
162k
        return NULL;
7156
0
    if (!PyTuple_Check(restuple)) {
7157
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7158
0
        Py_DECREF(restuple);
7159
0
        return NULL;
7160
0
    }
7161
0
    if (!PyArg_ParseTuple(restuple, argparse,
7162
0
                          &resunicode, newpos)) {
7163
0
        Py_DECREF(restuple);
7164
0
        return NULL;
7165
0
    }
7166
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7167
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7168
0
        Py_DECREF(restuple);
7169
0
        return NULL;
7170
0
    }
7171
0
    if (*newpos<0)
7172
0
        *newpos = len + *newpos;
7173
0
    if (*newpos<0 || *newpos>len) {
7174
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7175
0
        Py_DECREF(restuple);
7176
0
        return NULL;
7177
0
    }
7178
0
    Py_INCREF(resunicode);
7179
0
    Py_DECREF(restuple);
7180
0
    return resunicode;
7181
0
}
7182
7183
static PyObject *
7184
unicode_encode_ucs1(PyObject *unicode,
7185
                    const char *errors,
7186
                    const Py_UCS4 limit)
7187
594k
{
7188
    /* input state */
7189
594k
    Py_ssize_t pos=0, size;
7190
594k
    int kind;
7191
594k
    const void *data;
7192
594k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7193
594k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7194
594k
    PyObject *error_handler_obj = NULL;
7195
594k
    PyObject *exc = NULL;
7196
594k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7197
594k
    PyObject *rep = NULL;
7198
7199
594k
    size = PyUnicode_GET_LENGTH(unicode);
7200
594k
    kind = PyUnicode_KIND(unicode);
7201
594k
    data = PyUnicode_DATA(unicode);
7202
    /* allocate enough for a simple encoding without
7203
       replacements, if we need more, we'll resize */
7204
594k
    if (size == 0)
7205
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7206
7207
    /* output object */
7208
594k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7209
594k
    if (writer == NULL) {
7210
0
        return NULL;
7211
0
    }
7212
    /* pointer into the output */
7213
594k
    char *str = PyBytesWriter_GetData(writer);
7214
7215
4.40M
    while (pos < size) {
7216
4.40M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7217
7218
        /* can we encode this? */
7219
4.40M
        if (ch < limit) {
7220
            /* no overflow check, because we know that the space is enough */
7221
3.80M
            *str++ = (char)ch;
7222
3.80M
            ++pos;
7223
3.80M
        }
7224
594k
        else {
7225
594k
            Py_ssize_t newpos, i;
7226
            /* startpos for collecting unencodable chars */
7227
594k
            Py_ssize_t collstart = pos;
7228
594k
            Py_ssize_t collend = collstart + 1;
7229
            /* find all unecodable characters */
7230
7231
2.47M
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7232
1.88M
                ++collend;
7233
7234
            /* Only overallocate the buffer if it's not the last write */
7235
594k
            writer->overallocate = (collend < size);
7236
7237
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7238
594k
            if (error_handler == _Py_ERROR_UNKNOWN)
7239
594k
                error_handler = _Py_GetErrorHandler(errors);
7240
7241
594k
            switch (error_handler) {
7242
576k
            case _Py_ERROR_STRICT:
7243
576k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7244
576k
                goto onError;
7245
7246
6.46k
            case _Py_ERROR_REPLACE:
7247
6.46k
                memset(str, '?', collend - collstart);
7248
6.46k
                str += (collend - collstart);
7249
6.46k
                _Py_FALLTHROUGH;
7250
6.46k
            case _Py_ERROR_IGNORE:
7251
6.46k
                pos = collend;
7252
6.46k
                break;
7253
7254
0
            case _Py_ERROR_BACKSLASHREPLACE:
7255
                /* subtract preallocated bytes */
7256
0
                writer->size -= (collend - collstart);
7257
0
                str = backslashreplace(writer, str,
7258
0
                                       unicode, collstart, collend);
7259
0
                if (str == NULL)
7260
0
                    goto onError;
7261
0
                pos = collend;
7262
0
                break;
7263
7264
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7265
                /* subtract preallocated bytes */
7266
0
                writer->size -= (collend - collstart);
7267
0
                str = xmlcharrefreplace(writer, str,
7268
0
                                        unicode, collstart, collend);
7269
0
                if (str == NULL)
7270
0
                    goto onError;
7271
0
                pos = collend;
7272
0
                break;
7273
7274
11.6k
            case _Py_ERROR_SURROGATEESCAPE:
7275
11.6k
                for (i = collstart; i < collend; ++i) {
7276
11.6k
                    ch = PyUnicode_READ(kind, data, i);
7277
11.6k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7278
                        /* Not a UTF-8b surrogate */
7279
11.6k
                        break;
7280
11.6k
                    }
7281
0
                    *str++ = (char)(ch - 0xdc00);
7282
0
                    ++pos;
7283
0
                }
7284
11.6k
                if (i >= collend)
7285
0
                    break;
7286
11.6k
                collstart = pos;
7287
11.6k
                assert(collstart != collend);
7288
11.6k
                _Py_FALLTHROUGH;
7289
7290
11.6k
            default:
7291
11.6k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7292
11.6k
                                                       encoding, reason, unicode, &exc,
7293
11.6k
                                                       collstart, collend, &newpos);
7294
11.6k
                if (rep == NULL)
7295
11.6k
                    goto onError;
7296
7297
0
                if (newpos < collstart) {
7298
0
                    writer->overallocate = 1;
7299
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7300
0
                                                             collstart - newpos,
7301
0
                                                             str);
7302
0
                    if (str == NULL) {
7303
0
                        goto onError;
7304
0
                    }
7305
0
                }
7306
0
                else {
7307
                    /* subtract preallocated bytes */
7308
0
                    writer->size -= newpos - collstart;
7309
                    /* Only overallocate the buffer if it's not the last write */
7310
0
                    writer->overallocate = (newpos < size);
7311
0
                }
7312
7313
0
                char *rep_str;
7314
0
                Py_ssize_t rep_len;
7315
0
                if (PyBytes_Check(rep)) {
7316
                    /* Directly copy bytes result to output. */
7317
0
                    rep_str = PyBytes_AS_STRING(rep);
7318
0
                    rep_len = PyBytes_GET_SIZE(rep);
7319
0
                }
7320
0
                else {
7321
0
                    assert(PyUnicode_Check(rep));
7322
7323
0
                    if (limit == 256 ?
7324
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7325
0
                        !PyUnicode_IS_ASCII(rep))
7326
0
                    {
7327
                        /* Not all characters are smaller than limit */
7328
0
                        raise_encode_exception(&exc, encoding, unicode,
7329
0
                                               collstart, collend, reason);
7330
0
                        goto onError;
7331
0
                    }
7332
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7333
0
                    rep_str = PyUnicode_DATA(rep);
7334
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7335
0
                }
7336
7337
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7338
0
                if (str == NULL) {
7339
0
                    goto onError;
7340
0
                }
7341
0
                memcpy(str, rep_str, rep_len);
7342
0
                str += rep_len;
7343
7344
0
                pos = newpos;
7345
0
                Py_CLEAR(rep);
7346
594k
            }
7347
7348
            /* If overallocation was disabled, ensure that it was the last
7349
               write. Otherwise, we missed an optimization */
7350
594k
            assert(writer->overallocate || pos == size);
7351
6.46k
        }
7352
4.40M
    }
7353
7354
6.33k
    Py_XDECREF(error_handler_obj);
7355
6.33k
    Py_XDECREF(exc);
7356
6.33k
    return PyBytesWriter_FinishWithPointer(writer, str);
7357
7358
588k
  onError:
7359
588k
    Py_XDECREF(rep);
7360
588k
    PyBytesWriter_Discard(writer);
7361
588k
    Py_XDECREF(error_handler_obj);
7362
588k
    Py_XDECREF(exc);
7363
588k
    return NULL;
7364
594k
}
7365
7366
/* Encode `unicode` to Latin-1 using the error handler named by `errors`.
   Calls PyErr_BadArgument() and returns NULL when `unicode` is not a str. */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Non-Latin-1 characters are present: the generic encoder applies
           `errors` to them. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string is already Latin-1, so the bytes object
       is built straight from its storage. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7382
7383
/* Encode `unicode` to Latin-1, passing NULL as the error handler name. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    const char *errors = NULL;

    return _PyUnicode_AsLatin1String(unicode, errors);
}
7388
7389
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7390
7391
PyObject *
7392
PyUnicode_DecodeASCII(const char *s,
7393
                      Py_ssize_t size,
7394
                      const char *errors)
7395
13.2M
{
7396
13.2M
    const char *starts = s;
7397
13.2M
    const char *e = s + size;
7398
13.2M
    PyObject *error_handler_obj = NULL;
7399
13.2M
    PyObject *exc = NULL;
7400
13.2M
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7401
7402
13.2M
    if (size == 0)
7403
0
        _Py_RETURN_UNICODE_EMPTY();
7404
7405
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7406
13.2M
    if (size == 1 && (unsigned char)s[0] < 128) {
7407
523k
        return get_latin1_char((unsigned char)s[0]);
7408
523k
    }
7409
7410
    // Shortcut for simple case
7411
12.7M
    PyObject *u = PyUnicode_New(size, 127);
7412
12.7M
    if (u == NULL) {
7413
0
        return NULL;
7414
0
    }
7415
12.7M
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7416
12.7M
    if (outpos == size) {
7417
9.55M
        return u;
7418
9.55M
    }
7419
7420
3.17M
    _PyUnicodeWriter writer;
7421
3.17M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7422
3.17M
    writer.pos = outpos;
7423
7424
3.17M
    s += outpos;
7425
3.17M
    int kind = writer.kind;
7426
3.17M
    void *data = writer.data;
7427
3.17M
    Py_ssize_t startinpos, endinpos;
7428
7429
24.1M
    while (s < e) {
7430
23.9M
        unsigned char c = (unsigned char)*s;
7431
23.9M
        if (c < 128) {
7432
6.32M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7433
6.32M
            writer.pos++;
7434
6.32M
            ++s;
7435
6.32M
            continue;
7436
6.32M
        }
7437
7438
        /* byte outsize range 0x00..0x7f: call the error handler */
7439
7440
17.6M
        if (error_handler == _Py_ERROR_UNKNOWN)
7441
3.17M
            error_handler = _Py_GetErrorHandler(errors);
7442
7443
17.6M
        switch (error_handler)
7444
17.6M
        {
7445
800k
        case _Py_ERROR_REPLACE:
7446
14.6M
        case _Py_ERROR_SURROGATEESCAPE:
7447
            /* Fast-path: the error handler only writes one character,
7448
               but we may switch to UCS2 at the first write */
7449
14.6M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7450
0
                goto onError;
7451
14.6M
            kind = writer.kind;
7452
14.6M
            data = writer.data;
7453
7454
14.6M
            if (error_handler == _Py_ERROR_REPLACE)
7455
800k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7456
13.8M
            else
7457
13.8M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7458
14.6M
            writer.pos++;
7459
14.6M
            ++s;
7460
14.6M
            break;
7461
7462
0
        case _Py_ERROR_IGNORE:
7463
0
            ++s;
7464
0
            break;
7465
7466
3.01M
        default:
7467
3.01M
            startinpos = s-starts;
7468
3.01M
            endinpos = startinpos + 1;
7469
3.01M
            if (unicode_decode_call_errorhandler_writer(
7470
3.01M
                    errors, &error_handler_obj,
7471
3.01M
                    "ascii", "ordinal not in range(128)",
7472
3.01M
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7473
3.01M
                    &writer))
7474
3.01M
                goto onError;
7475
0
            kind = writer.kind;
7476
0
            data = writer.data;
7477
17.6M
        }
7478
17.6M
    }
7479
157k
    Py_XDECREF(error_handler_obj);
7480
157k
    Py_XDECREF(exc);
7481
157k
    return _PyUnicodeWriter_Finish(&writer);
7482
7483
3.01M
  onError:
7484
3.01M
    _PyUnicodeWriter_Dealloc(&writer);
7485
3.01M
    Py_XDECREF(error_handler_obj);
7486
3.01M
    Py_XDECREF(exc);
7487
3.01M
    return NULL;
7488
3.17M
}
7489
7490
/* Encode `unicode` to ASCII using the error handler named by `errors`.
   Calls PyErr_BadArgument() and returns NULL when `unicode` is not a str. */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters are present: the generic encoder applies
           `errors` to them. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string is copied into the bytes object
       directly. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7504
7505
/* Encode `unicode` to ASCII, passing NULL as the error handler name. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    const char *errors = NULL;

    return _PyUnicode_AsASCIIString(unicode, errors);
}
7510
7511
#ifdef MS_WINDOWS
7512
7513
/* --- MBCS codecs for Windows -------------------------------------------- */
7514
7515
#if SIZEOF_INT < SIZEOF_SIZE_T
7516
#define NEED_RETRY
7517
#endif
7518
7519
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7520
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7521
   both cases and also avoids partial characters overrunning the
7522
   length limit in MultiByteToWideChar on Windows */
7523
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7524
7525
#ifndef WC_ERR_INVALID_CHARS
7526
#  define WC_ERR_INVALID_CHARS 0x0080
7527
#endif
7528
7529
static const char*
7530
code_page_name(UINT code_page, PyObject **obj)
7531
{
7532
    *obj = NULL;
7533
    if (code_page == CP_ACP)
7534
        return "mbcs";
7535
7536
    *obj = PyBytes_FromFormat("cp%u", code_page);
7537
    if (*obj == NULL)
7538
        return NULL;
7539
    return PyBytes_AS_STRING(*obj);
7540
}
7541
7542
static DWORD
7543
decode_code_page_flags(UINT code_page)
7544
{
7545
    if (code_page == CP_UTF7) {
7546
        /* The CP_UTF7 decoder only supports flags=0 */
7547
        return 0;
7548
    }
7549
    else
7550
        return MB_ERR_INVALID_CHARS;
7551
}
7552
7553
/*
7554
 * Decode a byte string from a Windows code page into unicode object in strict
7555
 * mode.
7556
 *
7557
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7558
 * OSError and returns -1 on other error.
7559
 */
7560
static int
7561
decode_code_page_strict(UINT code_page,
7562
                        wchar_t **buf,
7563
                        Py_ssize_t *bufsize,
7564
                        const char *in,
7565
                        int insize)
7566
{
7567
    DWORD flags = MB_ERR_INVALID_CHARS;
7568
    wchar_t *out;
7569
    DWORD outsize;
7570
7571
    /* First get the size of the result */
7572
    assert(insize > 0);
7573
    while ((outsize = MultiByteToWideChar(code_page, flags,
7574
                                          in, insize, NULL, 0)) <= 0)
7575
    {
7576
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7577
            goto error;
7578
        }
7579
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7580
        flags = 0;
7581
    }
7582
7583
    /* Extend a wchar_t* buffer */
7584
    Py_ssize_t n = *bufsize;   /* Get the current length */
7585
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7586
        return -1;
7587
    }
7588
    out = *buf + n;
7589
7590
    /* Do the conversion */
7591
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7592
    if (outsize <= 0)
7593
        goto error;
7594
    return insize;
7595
7596
error:
7597
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7598
        return -2;
7599
    PyErr_SetFromWindowsErr(0);
7600
    return -1;
7601
}
7602
7603
/*
7604
 * Decode a byte string from a code page into unicode object with an error
7605
 * handler.
7606
 *
7607
 * Returns consumed size if succeed, or raise an OSError or
7608
 * UnicodeDecodeError exception and returns -1 on error.
7609
 */
7610
static int
7611
decode_code_page_errors(UINT code_page,
7612
                        wchar_t **buf,
7613
                        Py_ssize_t *bufsize,
7614
                        const char *in, const int size,
7615
                        const char *errors, int final)
7616
{
7617
    const char *startin = in;
7618
    const char *endin = in + size;
7619
    DWORD flags = MB_ERR_INVALID_CHARS;
7620
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7621
       2000 English version of the message. */
7622
    const char *reason = "No mapping for the Unicode character exists "
7623
                         "in the target code page.";
7624
    /* each step cannot decode more than 1 character, but a character can be
7625
       represented as a surrogate pair */
7626
    wchar_t buffer[2], *out;
7627
    int insize;
7628
    Py_ssize_t outsize;
7629
    PyObject *errorHandler = NULL;
7630
    PyObject *exc = NULL;
7631
    PyObject *encoding_obj = NULL;
7632
    const char *encoding;
7633
    DWORD err;
7634
    int ret = -1;
7635
7636
    assert(size > 0);
7637
7638
    encoding = code_page_name(code_page, &encoding_obj);
7639
    if (encoding == NULL)
7640
        return -1;
7641
7642
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7643
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7644
           UnicodeDecodeError. */
7645
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7646
        if (exc != NULL) {
7647
            PyCodec_StrictErrors(exc);
7648
            Py_CLEAR(exc);
7649
        }
7650
        goto error;
7651
    }
7652
7653
    /* Extend a wchar_t* buffer */
7654
    Py_ssize_t n = *bufsize;   /* Get the current length */
7655
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7656
        PyErr_NoMemory();
7657
        goto error;
7658
    }
7659
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7660
        goto error;
7661
    }
7662
    out = *buf + n;
7663
7664
    /* Decode the byte string character per character */
7665
    while (in < endin)
7666
    {
7667
        /* Decode a character */
7668
        insize = 1;
7669
        do
7670
        {
7671
            outsize = MultiByteToWideChar(code_page, flags,
7672
                                          in, insize,
7673
                                          buffer, Py_ARRAY_LENGTH(buffer));
7674
            if (outsize > 0)
7675
                break;
7676
            err = GetLastError();
7677
            if (err == ERROR_INVALID_FLAGS && flags) {
7678
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7679
                flags = 0;
7680
                continue;
7681
            }
7682
            if (err != ERROR_NO_UNICODE_TRANSLATION
7683
                && err != ERROR_INSUFFICIENT_BUFFER)
7684
            {
7685
                PyErr_SetFromWindowsErr(err);
7686
                goto error;
7687
            }
7688
            insize++;
7689
        }
7690
        /* 4=maximum length of a UTF-8 sequence */
7691
        while (insize <= 4 && (in + insize) <= endin);
7692
7693
        if (outsize <= 0) {
7694
            Py_ssize_t startinpos, endinpos, outpos;
7695
7696
            /* last character in partial decode? */
7697
            if (in + insize >= endin && !final)
7698
                break;
7699
7700
            startinpos = in - startin;
7701
            endinpos = startinpos + 1;
7702
            outpos = out - *buf;
7703
            if (unicode_decode_call_errorhandler_wchar(
7704
                    errors, &errorHandler,
7705
                    encoding, reason,
7706
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7707
                    buf, bufsize, &outpos))
7708
            {
7709
                goto error;
7710
            }
7711
            out = *buf + outpos;
7712
        }
7713
        else {
7714
            in += insize;
7715
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7716
            out += outsize;
7717
        }
7718
    }
7719
7720
    /* Shrink the buffer */
7721
    assert(out - *buf <= *bufsize);
7722
    *bufsize = out - *buf;
7723
    /* (in - startin) <= size and size is an int */
7724
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7725
7726
error:
7727
    Py_XDECREF(encoding_obj);
7728
    Py_XDECREF(errorHandler);
7729
    Py_XDECREF(exc);
7730
    return ret;
7731
}
7732
7733
static PyObject *
7734
decode_code_page_stateful(int code_page,
7735
                          const char *s, Py_ssize_t size,
7736
                          const char *errors, Py_ssize_t *consumed)
7737
{
7738
    wchar_t *buf = NULL;
7739
    Py_ssize_t bufsize = 0;
7740
    int chunk_size, final, converted, done;
7741
7742
    if (code_page < 0) {
7743
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7744
        return NULL;
7745
    }
7746
    if (size < 0) {
7747
        PyErr_BadInternalCall();
7748
        return NULL;
7749
    }
7750
7751
    if (consumed)
7752
        *consumed = 0;
7753
7754
    do
7755
    {
7756
#ifdef NEED_RETRY
7757
        if (size > DECODING_CHUNK_SIZE) {
7758
            chunk_size = DECODING_CHUNK_SIZE;
7759
            final = 0;
7760
            done = 0;
7761
        }
7762
        else
7763
#endif
7764
        {
7765
            chunk_size = (int)size;
7766
            final = (consumed == NULL);
7767
            done = 1;
7768
        }
7769
7770
        if (chunk_size == 0 && done) {
7771
            if (buf != NULL)
7772
                break;
7773
            _Py_RETURN_UNICODE_EMPTY();
7774
        }
7775
7776
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7777
                                            s, chunk_size);
7778
        if (converted == -2)
7779
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7780
                                                s, chunk_size,
7781
                                                errors, final);
7782
        assert(converted != 0 || done);
7783
7784
        if (converted < 0) {
7785
            PyMem_Free(buf);
7786
            return NULL;
7787
        }
7788
7789
        if (consumed)
7790
            *consumed += converted;
7791
7792
        s += converted;
7793
        size -= converted;
7794
    } while (!done);
7795
7796
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7797
    PyMem_Free(buf);
7798
    return v;
7799
}
7800
7801
PyObject *
7802
PyUnicode_DecodeCodePageStateful(int code_page,
7803
                                 const char *s,
7804
                                 Py_ssize_t size,
7805
                                 const char *errors,
7806
                                 Py_ssize_t *consumed)
7807
{
7808
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7809
}
7810
7811
PyObject *
7812
PyUnicode_DecodeMBCSStateful(const char *s,
7813
                             Py_ssize_t size,
7814
                             const char *errors,
7815
                             Py_ssize_t *consumed)
7816
{
7817
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7818
}
7819
7820
PyObject *
7821
PyUnicode_DecodeMBCS(const char *s,
7822
                     Py_ssize_t size,
7823
                     const char *errors)
7824
{
7825
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7826
}
7827
7828
static DWORD
7829
encode_code_page_flags(UINT code_page, const char *errors)
7830
{
7831
    if (code_page == CP_UTF8) {
7832
        return WC_ERR_INVALID_CHARS;
7833
    }
7834
    else if (code_page == CP_UTF7) {
7835
        /* CP_UTF7 only supports flags=0 */
7836
        return 0;
7837
    }
7838
    else {
7839
        if (errors != NULL && strcmp(errors, "replace") == 0)
7840
            return 0;
7841
        else
7842
            return WC_NO_BEST_FIT_CHARS;
7843
    }
7844
}
7845
7846
/*
7847
 * Encode a Unicode string to a Windows code page into a byte string in strict
7848
 * mode.
7849
 *
7850
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7851
 * an OSError and returns -1 on other error.
7852
 */
7853
static int
7854
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7855
                        PyObject *unicode, Py_ssize_t offset, int len,
7856
                        const char* errors)
7857
{
7858
    BOOL usedDefaultChar = FALSE;
7859
    BOOL *pusedDefaultChar = &usedDefaultChar;
7860
    int outsize;
7861
    wchar_t *p;
7862
    Py_ssize_t size;
7863
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7864
    char *out;
7865
    /* Create a substring so that we can get the UTF-16 representation
7866
       of just the slice under consideration. */
7867
    PyObject *substring;
7868
    int ret = -1;
7869
7870
    assert(len > 0);
7871
7872
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7873
        pusedDefaultChar = &usedDefaultChar;
7874
    else
7875
        pusedDefaultChar = NULL;
7876
7877
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7878
    if (substring == NULL)
7879
        return -1;
7880
    p = PyUnicode_AsWideCharString(substring, &size);
7881
    Py_CLEAR(substring);
7882
    if (p == NULL) {
7883
        return -1;
7884
    }
7885
    assert(size <= INT_MAX);
7886
7887
    /* First get the size of the result */
7888
    outsize = WideCharToMultiByte(code_page, flags,
7889
                                  p, (int)size,
7890
                                  NULL, 0,
7891
                                  NULL, pusedDefaultChar);
7892
    if (outsize <= 0)
7893
        goto error;
7894
    /* If we used a default char, then we failed! */
7895
    if (pusedDefaultChar && *pusedDefaultChar) {
7896
        ret = -2;
7897
        goto done;
7898
    }
7899
7900
    if (*writer == NULL) {
7901
        /* Create string object */
7902
        *writer = PyBytesWriter_Create(outsize);
7903
        if (*writer == NULL) {
7904
            goto done;
7905
        }
7906
        out = PyBytesWriter_GetData(*writer);
7907
    }
7908
    else {
7909
        /* Extend string object */
7910
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7911
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7912
            goto done;
7913
        }
7914
        out = (char*)PyBytesWriter_GetData(*writer) + n;
7915
    }
7916
7917
    /* Do the conversion */
7918
    outsize = WideCharToMultiByte(code_page, flags,
7919
                                  p, (int)size,
7920
                                  out, outsize,
7921
                                  NULL, pusedDefaultChar);
7922
    if (outsize <= 0)
7923
        goto error;
7924
    if (pusedDefaultChar && *pusedDefaultChar) {
7925
        ret = -2;
7926
        goto done;
7927
    }
7928
    ret = 0;
7929
7930
done:
7931
    PyMem_Free(p);
7932
    return ret;
7933
7934
error:
7935
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7936
        ret = -2;
7937
        goto done;
7938
    }
7939
    PyErr_SetFromWindowsErr(0);
7940
    goto done;
7941
}
7942
7943
/*
7944
 * Encode a Unicode string to a Windows code page into a byte string using an
7945
 * error handler.
7946
 *
7947
 * Returns consumed characters if succeed, or raise an OSError and returns
7948
 * -1 on other error.
7949
 */
7950
static int
7951
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
7952
                        PyObject *unicode, Py_ssize_t unicode_offset,
7953
                        Py_ssize_t insize, const char* errors)
7954
{
7955
    const DWORD flags = encode_code_page_flags(code_page, errors);
7956
    Py_ssize_t pos = unicode_offset;
7957
    Py_ssize_t endin = unicode_offset + insize;
7958
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7959
       2000 English version of the message. */
7960
    const char *reason = "invalid character";
7961
    /* 4=maximum length of a UTF-8 sequence */
7962
    char buffer[4];
7963
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7964
    Py_ssize_t outsize;
7965
    char *out;
7966
    PyObject *errorHandler = NULL;
7967
    PyObject *exc = NULL;
7968
    PyObject *encoding_obj = NULL;
7969
    const char *encoding;
7970
    Py_ssize_t newpos;
7971
    PyObject *rep;
7972
    int ret = -1;
7973
7974
    assert(insize > 0);
7975
7976
    encoding = code_page_name(code_page, &encoding_obj);
7977
    if (encoding == NULL)
7978
        return -1;
7979
7980
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7981
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7982
           then we raise a UnicodeEncodeError. */
7983
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7984
        if (exc != NULL) {
7985
            PyCodec_StrictErrors(exc);
7986
            Py_DECREF(exc);
7987
        }
7988
        Py_XDECREF(encoding_obj);
7989
        return -1;
7990
    }
7991
7992
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7993
        pusedDefaultChar = &usedDefaultChar;
7994
    else
7995
        pusedDefaultChar = NULL;
7996
7997
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7998
        PyErr_NoMemory();
7999
        goto error;
8000
    }
8001
    outsize = insize * Py_ARRAY_LENGTH(buffer);
8002
8003
    if (*writer == NULL) {
8004
        /* Create string object */
8005
        *writer = PyBytesWriter_Create(outsize);
8006
        if (*writer == NULL) {
8007
            goto error;
8008
        }
8009
        out = PyBytesWriter_GetData(*writer);
8010
    }
8011
    else {
8012
        /* Extend string object */
8013
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8014
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8015
            goto error;
8016
        }
8017
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8018
    }
8019
8020
    /* Encode the string character per character */
8021
    while (pos < endin)
8022
    {
8023
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8024
        wchar_t chars[2];
8025
        int charsize;
8026
        if (ch < 0x10000) {
8027
            chars[0] = (wchar_t)ch;
8028
            charsize = 1;
8029
        }
8030
        else {
8031
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8032
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8033
            charsize = 2;
8034
        }
8035
8036
        outsize = WideCharToMultiByte(code_page, flags,
8037
                                      chars, charsize,
8038
                                      buffer, Py_ARRAY_LENGTH(buffer),
8039
                                      NULL, pusedDefaultChar);
8040
        if (outsize > 0) {
8041
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8042
            {
8043
                pos++;
8044
                memcpy(out, buffer, outsize);
8045
                out += outsize;
8046
                continue;
8047
            }
8048
        }
8049
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8050
            PyErr_SetFromWindowsErr(0);
8051
            goto error;
8052
        }
8053
8054
        rep = unicode_encode_call_errorhandler(
8055
                  errors, &errorHandler, encoding, reason,
8056
                  unicode, &exc,
8057
                  pos, pos + 1, &newpos);
8058
        if (rep == NULL)
8059
            goto error;
8060
8061
        Py_ssize_t morebytes = pos - newpos;
8062
        if (PyBytes_Check(rep)) {
8063
            outsize = PyBytes_GET_SIZE(rep);
8064
            morebytes += outsize;
8065
            if (morebytes > 0) {
8066
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8067
                if (out == NULL) {
8068
                    Py_DECREF(rep);
8069
                    goto error;
8070
                }
8071
            }
8072
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8073
            out += outsize;
8074
        }
8075
        else {
8076
            Py_ssize_t i;
8077
            int kind;
8078
            const void *data;
8079
8080
            outsize = PyUnicode_GET_LENGTH(rep);
8081
            morebytes += outsize;
8082
            if (morebytes > 0) {
8083
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8084
                if (out == NULL) {
8085
                    Py_DECREF(rep);
8086
                    goto error;
8087
                }
8088
            }
8089
            kind = PyUnicode_KIND(rep);
8090
            data = PyUnicode_DATA(rep);
8091
            for (i=0; i < outsize; i++) {
8092
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8093
                if (ch > 127) {
8094
                    raise_encode_exception(&exc,
8095
                        encoding, unicode,
8096
                        pos, pos + 1,
8097
                        "unable to encode error handler result to ASCII");
8098
                    Py_DECREF(rep);
8099
                    goto error;
8100
                }
8101
                *out = (unsigned char)ch;
8102
                out++;
8103
            }
8104
        }
8105
        pos = newpos;
8106
        Py_DECREF(rep);
8107
    }
8108
    /* write a NUL byte */
8109
    *out = 0;
8110
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8111
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8112
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8113
        goto error;
8114
    }
8115
    ret = 0;
8116
8117
error:
8118
    Py_XDECREF(encoding_obj);
8119
    Py_XDECREF(errorHandler);
8120
    Py_XDECREF(exc);
8121
    return ret;
8122
}
8123
8124
8125
/*
 * Encode the str object `unicode` to the Windows code page `code_page`.
 *
 * The string is processed chunk by chunk: each chunk goes through
 * encode_code_page_strict() first and, when that reports -2, through
 * encode_code_page_errors() with the `errors` handler.
 *
 * Returns the finished bytes writer result, the empty bytes constant for an
 * empty string, or NULL on failure.
 */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyBytesWriter *writer = NULL;
    Py_ssize_t remaining;
    Py_ssize_t offset = 0;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    remaining = PyUnicode_GET_LENGTH(unicode);
    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    for (;;) {
        int chunk_len, last_chunk, ret;

#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            last_chunk = 0;
        }
        else
#endif
        {
            chunk_len = (int)remaining;
            last_chunk = 1;
        }

        ret = encode_code_page_strict(code_page, &writer,
                                      unicode, offset, chunk_len,
                                      errors);
        if (ret == -2) {
            /* Strict encoding hit an unencodable character: redo this
               chunk with the error handler. */
            ret = encode_code_page_errors(code_page, &writer,
                                          unicode, offset,
                                          chunk_len, errors);
        }
        if (ret < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        offset += chunk_len;
        remaining -= chunk_len;

        if (last_chunk) {
            break;
        }
    }

    return PyBytesWriter_Finish(writer);
}
8183
8184
8185
/* Encode `unicode` with the CP_ACP code page, passing NULL as the error
   handler name to PyUnicode_EncodeCodePage(). */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *errors = NULL;
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
}
8190
8191
#undef NEED_RETRY
8192
8193
#endif /* MS_WINDOWS */
8194
8195
/* --- Character Mapping Codec -------------------------------------------- */
8196
8197
static int
8198
charmap_decode_string(const char *s,
8199
                      Py_ssize_t size,
8200
                      PyObject *mapping,
8201
                      const char *errors,
8202
                      _PyUnicodeWriter *writer)
8203
737k
{
8204
737k
    const char *starts = s;
8205
737k
    const char *e;
8206
737k
    Py_ssize_t startinpos, endinpos;
8207
737k
    PyObject *errorHandler = NULL, *exc = NULL;
8208
737k
    Py_ssize_t maplen;
8209
737k
    int mapkind;
8210
737k
    const void *mapdata;
8211
737k
    Py_UCS4 x;
8212
737k
    unsigned char ch;
8213
8214
737k
    maplen = PyUnicode_GET_LENGTH(mapping);
8215
737k
    mapdata = PyUnicode_DATA(mapping);
8216
737k
    mapkind = PyUnicode_KIND(mapping);
8217
8218
737k
    e = s + size;
8219
8220
737k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8221
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8222
         * is disabled in encoding aliases, latin1 is preferred because
8223
         * its implementation is faster. */
8224
130
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8225
130
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8226
130
        Py_UCS4 maxchar = writer->maxchar;
8227
8228
130
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8229
1.05M
        while (s < e) {
8230
1.05M
            ch = *s;
8231
1.05M
            x = mapdata_ucs1[ch];
8232
1.05M
            if (x > maxchar) {
8233
120
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8234
0
                    goto onError;
8235
120
                maxchar = writer->maxchar;
8236
120
                outdata = (Py_UCS1 *)writer->data;
8237
120
            }
8238
1.05M
            outdata[writer->pos] = x;
8239
1.05M
            writer->pos++;
8240
1.05M
            ++s;
8241
1.05M
        }
8242
130
        return 0;
8243
130
    }
8244
8245
849k
    while (s < e) {
8246
834k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8247
834k
            int outkind = writer->kind;
8248
834k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8249
834k
            if (outkind == PyUnicode_1BYTE_KIND) {
8250
773k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8251
773k
                Py_UCS4 maxchar = writer->maxchar;
8252
21.6M
                while (s < e) {
8253
20.9M
                    ch = *s;
8254
20.9M
                    x = mapdata_ucs2[ch];
8255
20.9M
                    if (x > maxchar)
8256
77.0k
                        goto Error;
8257
20.8M
                    outdata[writer->pos] = x;
8258
20.8M
                    writer->pos++;
8259
20.8M
                    ++s;
8260
20.8M
                }
8261
696k
                break;
8262
773k
            }
8263
60.6k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8264
60.6k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8265
28.1M
                while (s < e) {
8266
28.1M
                    ch = *s;
8267
28.1M
                    x = mapdata_ucs2[ch];
8268
28.1M
                    if (x == 0xFFFE)
8269
35.6k
                        goto Error;
8270
28.0M
                    outdata[writer->pos] = x;
8271
28.0M
                    writer->pos++;
8272
28.0M
                    ++s;
8273
28.0M
                }
8274
25.0k
                break;
8275
60.6k
            }
8276
834k
        }
8277
0
        ch = *s;
8278
8279
0
        if (ch < maplen)
8280
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8281
0
        else
8282
0
            x = 0xfffe; /* invalid value */
8283
112k
Error:
8284
112k
        if (x == 0xfffe)
8285
56.6k
        {
8286
            /* undefined mapping */
8287
56.6k
            startinpos = s-starts;
8288
56.6k
            endinpos = startinpos+1;
8289
56.6k
            if (unicode_decode_call_errorhandler_writer(
8290
56.6k
                    errors, &errorHandler,
8291
56.6k
                    "charmap", "character maps to <undefined>",
8292
56.6k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8293
56.6k
                    writer)) {
8294
20
                goto onError;
8295
20
            }
8296
56.6k
            continue;
8297
56.6k
        }
8298
8299
55.9k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8300
0
            goto onError;
8301
55.9k
        ++s;
8302
55.9k
    }
8303
736k
    Py_XDECREF(errorHandler);
8304
736k
    Py_XDECREF(exc);
8305
736k
    return 0;
8306
8307
20
onError:
8308
20
    Py_XDECREF(errorHandler);
8309
20
    Py_XDECREF(exc);
8310
20
    return -1;
8311
736k
}
8312
8313
static int
8314
charmap_decode_mapping(const char *s,
8315
                       Py_ssize_t size,
8316
                       PyObject *mapping,
8317
                       const char *errors,
8318
                       _PyUnicodeWriter *writer)
8319
0
{
8320
0
    const char *starts = s;
8321
0
    const char *e;
8322
0
    Py_ssize_t startinpos, endinpos;
8323
0
    PyObject *errorHandler = NULL, *exc = NULL;
8324
0
    unsigned char ch;
8325
0
    PyObject *key, *item = NULL;
8326
8327
0
    e = s + size;
8328
8329
0
    while (s < e) {
8330
0
        ch = *s;
8331
8332
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8333
0
        key = PyLong_FromLong((long)ch);
8334
0
        if (key == NULL)
8335
0
            goto onError;
8336
8337
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8338
0
        Py_DECREF(key);
8339
0
        if (rc == 0) {
8340
            /* No mapping found means: mapping is undefined. */
8341
0
            goto Undefined;
8342
0
        }
8343
0
        if (item == NULL) {
8344
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8345
                /* No mapping found means: mapping is undefined. */
8346
0
                PyErr_Clear();
8347
0
                goto Undefined;
8348
0
            } else
8349
0
                goto onError;
8350
0
        }
8351
8352
        /* Apply mapping */
8353
0
        if (item == Py_None)
8354
0
            goto Undefined;
8355
0
        if (PyLong_Check(item)) {
8356
0
            long value = PyLong_AsLong(item);
8357
0
            if (value == 0xFFFE)
8358
0
                goto Undefined;
8359
0
            if (value < 0 || value > MAX_UNICODE) {
8360
0
                PyErr_Format(PyExc_TypeError,
8361
0
                             "character mapping must be in range(0x%lx)",
8362
0
                             (unsigned long)MAX_UNICODE + 1);
8363
0
                goto onError;
8364
0
            }
8365
8366
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8367
0
                goto onError;
8368
0
        }
8369
0
        else if (PyUnicode_Check(item)) {
8370
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8371
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8372
0
                if (value == 0xFFFE)
8373
0
                    goto Undefined;
8374
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8375
0
                    goto onError;
8376
0
            }
8377
0
            else {
8378
0
                writer->overallocate = 1;
8379
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8380
0
                    goto onError;
8381
0
            }
8382
0
        }
8383
0
        else {
8384
            /* wrong return value */
8385
0
            PyErr_SetString(PyExc_TypeError,
8386
0
                            "character mapping must return integer, None or str");
8387
0
            goto onError;
8388
0
        }
8389
0
        Py_CLEAR(item);
8390
0
        ++s;
8391
0
        continue;
8392
8393
0
Undefined:
8394
        /* undefined mapping */
8395
0
        Py_CLEAR(item);
8396
0
        startinpos = s-starts;
8397
0
        endinpos = startinpos+1;
8398
0
        if (unicode_decode_call_errorhandler_writer(
8399
0
                errors, &errorHandler,
8400
0
                "charmap", "character maps to <undefined>",
8401
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8402
0
                writer)) {
8403
0
            goto onError;
8404
0
        }
8405
0
    }
8406
0
    Py_XDECREF(errorHandler);
8407
0
    Py_XDECREF(exc);
8408
0
    return 0;
8409
8410
0
onError:
8411
0
    Py_XDECREF(item);
8412
0
    Py_XDECREF(errorHandler);
8413
0
    Py_XDECREF(exc);
8414
0
    return -1;
8415
0
}
8416
8417
PyObject *
8418
PyUnicode_DecodeCharmap(const char *s,
8419
                        Py_ssize_t size,
8420
                        PyObject *mapping,
8421
                        const char *errors)
8422
737k
{
8423
737k
    _PyUnicodeWriter writer;
8424
8425
    /* Default to Latin-1 */
8426
737k
    if (mapping == NULL)
8427
21
        return PyUnicode_DecodeLatin1(s, size, errors);
8428
8429
737k
    if (size == 0)
8430
0
        _Py_RETURN_UNICODE_EMPTY();
8431
737k
    _PyUnicodeWriter_Init(&writer);
8432
737k
    writer.min_length = size;
8433
737k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8434
0
        goto onError;
8435
8436
737k
    if (PyUnicode_CheckExact(mapping)) {
8437
737k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8438
20
            goto onError;
8439
737k
    }
8440
0
    else {
8441
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8442
0
            goto onError;
8443
0
    }
8444
737k
    return _PyUnicodeWriter_Finish(&writer);
8445
8446
20
  onError:
8447
20
    _PyUnicodeWriter_Dealloc(&writer);
8448
20
    return NULL;
8449
737k
}
8450
8451
/* Charmap encoding: the lookup table */
8452
8453
/*[clinic input]
8454
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8455
[clinic start generated code]*/
8456
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8457
8458
struct encoding_map {
8459
    PyObject_HEAD
8460
    unsigned char level1[32];
8461
    int count2, count3;
8462
    unsigned char level23[1];
8463
};
8464
8465
/*[clinic input]
8466
EncodingMap.size
8467
8468
Return the size (in bytes) of this object.
8469
[clinic start generated code]*/
8470
8471
static PyObject *
8472
EncodingMap_size_impl(struct encoding_map *self)
8473
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8474
0
{
8475
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8476
0
                           128*self->count3);
8477
0
}
8478
8479
static PyMethodDef encoding_map_methods[] = {
8480
    ENCODINGMAP_SIZE_METHODDEF
8481
    {NULL, NULL}
8482
};
8483
8484
static PyTypeObject EncodingMapType = {
8485
    PyVarObject_HEAD_INIT(NULL, 0)
8486
    .tp_name = "EncodingMap",
8487
    .tp_basicsize = sizeof(struct encoding_map),
8488
    /* methods */
8489
    .tp_flags = Py_TPFLAGS_DEFAULT,
8490
    .tp_methods = encoding_map_methods,
8491
};
8492
8493
/*
 * Build an encoding map from the decoding table `string`, where string[i]
 * is the character that byte value i decodes to.  At most the first 256
 * characters are used.
 *
 * Returns an EncodingMap object (a three-level trie) when possible.  A dict
 * {character ordinal: byte value} is returned instead when string[0] is not
 * U+0000, when a later entry is U+0000 or lies above U+FFFF, or when the
 * trie would need 0xFF or more level-2 or level-3 blocks.
 *
 * Returns NULL on failure; a non-str or empty argument is reported with
 * PyErr_BadArgument().
 */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    unsigned char level1[32];
    unsigned char level2[512];
    int count2 = 0, count3 = 0;
    int need_dict = 0;
    int kind, length, i;
    const void *data;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0) {
        need_dict = 1;
    }

    /* Pass 1: count how many level-2 and level-3 blocks the trie needs. */
    for (i = 1; i < length; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int l1, l2;

        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        l1 = ch >> 11;
        l2 = ch >> 7;
        if (level1[l1] == 0xFF) {
            level1[l1] = count2++;
        }
        if (level2[l2] == 0xFF) {
            level2[l2] = count3++;
        }
    }

    /* 0xFF is the "unused" marker, so it cannot serve as a block index. */
    if (count2 >= 0xFF || count3 >= 0xFF) {
        need_dict = 1;
    }

    if (need_dict) {
        PyObject *dict = PyDict_New();
        if (dict == NULL) {
            return NULL;
        }
        for (i = 0; i < length; i++) {
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
            PyObject *key, *value;
            int rc;

            key = PyLong_FromLong(c);
            if (key == NULL) {
                Py_DECREF(dict);
                return NULL;
            }
            value = PyLong_FromLong(i);
            if (value == NULL) {
                Py_DECREF(key);
                Py_DECREF(dict);
                return NULL;
            }
            rc = PyDict_SetItem(dict, key, value);
            Py_DECREF(key);
            Py_DECREF(value);
            if (rc < 0) {
                Py_DECREF(dict);
                return NULL;
            }
        }
        return dict;
    }

    /* Create a three-level trie */
    PyObject *result = PyObject_Malloc(sizeof(struct encoding_map) +
                                       16*count2 + 128*count3 - 1);
    if (result == NULL) {
        return PyErr_NoMemory();
    }

    _PyObject_Init(result, &EncodingMapType);

    struct encoding_map *mresult = (struct encoding_map*)result;
    unsigned char *mlevel1 = mresult->level1;
    unsigned char *mlevel2 = mresult->level23;
    unsigned char *mlevel3 = mresult->level23 + 16*count2;

    mresult->count2 = count2;
    mresult->count3 = count3;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);

    /* Pass 2: hand out level-3 blocks again, in the same order as pass 1,
       and store each byte value in its level-3 slot. */
    count3 = 0;
    for (i = 1; i < length; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int hi, mid, lo, slot2, slot3;

        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        hi = ch>>11;
        mid = (ch>>7) & 0xF;
        lo = ch & 0x7F;

        slot2 = 16*mlevel1[hi] + mid;
        if (mlevel2[slot2] == 0xFF) {
            mlevel2[slot2] = count3++;
        }
        slot3 = 128*mlevel2[slot2] + lo;
        mlevel3[slot3] = i;
    }
    return result;
}
8608
8609
static int
8610
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8611
0
{
8612
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8613
0
    int l1 = c>>11;
8614
0
    int l2 = (c>>7) & 0xF;
8615
0
    int l3 = c & 0x7F;
8616
0
    int i;
8617
8618
0
    if (c > 0xFFFF)
8619
0
        return -1;
8620
0
    if (c == 0)
8621
0
        return 0;
8622
    /* level 1*/
8623
0
    i = map->level1[l1];
8624
0
    if (i == 0xFF) {
8625
0
        return -1;
8626
0
    }
8627
    /* level 2*/
8628
0
    i = map->level23[16*i+l2];
8629
0
    if (i == 0xFF) {
8630
0
        return -1;
8631
0
    }
8632
    /* level 3 */
8633
0
    i = map->level23[16*map->count2 + 128*i + l3];
8634
0
    if (i == 0) {
8635
0
        return -1;
8636
0
    }
8637
0
    return i;
8638
0
}
8639
8640
/* Lookup the character in the mapping.
8641
   On success, return PyLong, PyBytes or None (if the character can't be found).
8642
   If the result is PyLong, put its value in replace.
8643
   On error, return NULL.
8644
   */
8645
static PyObject *
8646
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8647
0
{
8648
0
    PyObject *w = PyLong_FromLong((long)c);
8649
0
    PyObject *x;
8650
8651
0
    if (w == NULL)
8652
0
        return NULL;
8653
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8654
0
    Py_DECREF(w);
8655
0
    if (rc == 0) {
8656
        /* No mapping found means: mapping is undefined. */
8657
0
        Py_RETURN_NONE;
8658
0
    }
8659
0
    if (x == NULL) {
8660
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8661
            /* No mapping found means: mapping is undefined. */
8662
0
            PyErr_Clear();
8663
0
            Py_RETURN_NONE;
8664
0
        } else
8665
0
            return NULL;
8666
0
    }
8667
0
    else if (x == Py_None)
8668
0
        return x;
8669
0
    else if (PyLong_Check(x)) {
8670
0
        long value = PyLong_AsLong(x);
8671
0
        if (value < 0 || value > 255) {
8672
0
            PyErr_SetString(PyExc_TypeError,
8673
0
                            "character mapping must be in range(256)");
8674
0
            Py_DECREF(x);
8675
0
            return NULL;
8676
0
        }
8677
0
        *replace = (unsigned char)value;
8678
0
        return x;
8679
0
    }
8680
0
    else if (PyBytes_Check(x))
8681
0
        return x;
8682
0
    else {
8683
        /* wrong return value */
8684
0
        PyErr_Format(PyExc_TypeError,
8685
0
                     "character mapping must return integer, bytes or None, not %.400s",
8686
0
                     Py_TYPE(x)->tp_name);
8687
0
        Py_DECREF(x);
8688
0
        return NULL;
8689
0
    }
8690
0
}
8691
8692
static int
8693
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8694
0
{
8695
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8696
    /* exponentially overallocate to minimize reallocations */
8697
0
    if (requiredsize < 2 * outsize)
8698
0
        requiredsize = 2 * outsize;
8699
0
    return PyBytesWriter_Resize(writer, requiredsize);
8700
0
}
8701
8702
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
8705
/* lookup the character, put the result in the output string and adjust
8706
   various state variables. Resize the output bytes object if not enough
8707
   space is available. Return a new reference to the object that
8708
   was put in the output buffer, or Py_None, if the mapping was undefined
8709
   (in which case no character was written) or NULL, if a
8710
   reallocation error occurred. The caller must decref the result */
8711
static charmapencode_result
8712
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8713
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8714
0
{
8715
0
    PyObject *rep;
8716
0
    unsigned char replace;
8717
0
    char *outstart;
8718
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8719
8720
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8721
0
        int res = encoding_map_lookup(c, mapping);
8722
0
        Py_ssize_t requiredsize = *outpos+1;
8723
0
        if (res == -1) {
8724
0
            return enc_FAILED;
8725
0
        }
8726
8727
0
        if (outsize<requiredsize) {
8728
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8729
0
                return enc_EXCEPTION;
8730
0
            }
8731
0
        }
8732
0
        outstart = _PyBytesWriter_GetData(writer);
8733
0
        outstart[(*outpos)++] = (char)res;
8734
0
        return enc_SUCCESS;
8735
0
    }
8736
8737
0
    rep = charmapencode_lookup(c, mapping, &replace);
8738
0
    if (rep==NULL)
8739
0
        return enc_EXCEPTION;
8740
0
    else if (rep==Py_None) {
8741
0
        Py_DECREF(rep);
8742
0
        return enc_FAILED;
8743
0
    } else {
8744
0
        if (PyLong_Check(rep)) {
8745
0
            Py_ssize_t requiredsize = *outpos+1;
8746
0
            if (outsize<requiredsize)
8747
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8748
0
                    Py_DECREF(rep);
8749
0
                    return enc_EXCEPTION;
8750
0
                }
8751
0
            outstart = _PyBytesWriter_GetData(writer);
8752
0
            outstart[(*outpos)++] = (char)replace;
8753
0
        }
8754
0
        else {
8755
0
            const char *repchars = PyBytes_AS_STRING(rep);
8756
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8757
0
            Py_ssize_t requiredsize = *outpos+repsize;
8758
0
            if (outsize<requiredsize)
8759
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8760
0
                    Py_DECREF(rep);
8761
0
                    return enc_EXCEPTION;
8762
0
                }
8763
0
            outstart = _PyBytesWriter_GetData(writer);
8764
0
            memcpy(outstart + *outpos, repchars, repsize);
8765
0
            *outpos += repsize;
8766
0
        }
8767
0
    }
8768
0
    Py_DECREF(rep);
8769
0
    return enc_SUCCESS;
8770
0
}
8771
8772
/* handle an error in _PyUnicode_EncodeCharmap()
8773
   Return 0 on success, -1 on error */
8774
static int
8775
charmap_encoding_error(
8776
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8777
    PyObject **exceptionObject,
8778
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8779
    PyBytesWriter *writer, Py_ssize_t *respos)
8780
0
{
8781
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8782
0
    Py_ssize_t size, repsize;
8783
0
    Py_ssize_t newpos;
8784
0
    int kind;
8785
0
    const void *data;
8786
0
    Py_ssize_t index;
8787
    /* startpos for collecting unencodable chars */
8788
0
    Py_ssize_t collstartpos = *inpos;
8789
0
    Py_ssize_t collendpos = *inpos+1;
8790
0
    Py_ssize_t collpos;
8791
0
    const char *encoding = "charmap";
8792
0
    const char *reason = "character maps to <undefined>";
8793
0
    charmapencode_result x;
8794
0
    Py_UCS4 ch;
8795
0
    int val;
8796
8797
0
    size = PyUnicode_GET_LENGTH(unicode);
8798
    /* find all unencodable characters */
8799
0
    while (collendpos < size) {
8800
0
        PyObject *rep;
8801
0
        unsigned char replace;
8802
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8803
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8804
0
            val = encoding_map_lookup(ch, mapping);
8805
0
            if (val != -1)
8806
0
                break;
8807
0
            ++collendpos;
8808
0
            continue;
8809
0
        }
8810
8811
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8812
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8813
0
        if (rep==NULL)
8814
0
            return -1;
8815
0
        else if (rep!=Py_None) {
8816
0
            Py_DECREF(rep);
8817
0
            break;
8818
0
        }
8819
0
        Py_DECREF(rep);
8820
0
        ++collendpos;
8821
0
    }
8822
    /* cache callback name lookup
8823
     * (if not done yet, i.e. it's the first error) */
8824
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8825
0
        *error_handler = _Py_GetErrorHandler(errors);
8826
8827
0
    switch (*error_handler) {
8828
0
    case _Py_ERROR_STRICT:
8829
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8830
0
        return -1;
8831
8832
0
    case _Py_ERROR_REPLACE:
8833
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8834
0
            x = charmapencode_output('?', mapping, writer, respos);
8835
0
            if (x==enc_EXCEPTION) {
8836
0
                return -1;
8837
0
            }
8838
0
            else if (x==enc_FAILED) {
8839
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8840
0
                return -1;
8841
0
            }
8842
0
        }
8843
0
        _Py_FALLTHROUGH;
8844
0
    case _Py_ERROR_IGNORE:
8845
0
        *inpos = collendpos;
8846
0
        break;
8847
8848
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8849
        /* generate replacement (temporarily (mis)uses p) */
8850
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8851
0
            char buffer[2+29+1+1];
8852
0
            char *cp;
8853
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8854
0
            for (cp = buffer; *cp; ++cp) {
8855
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8856
0
                if (x==enc_EXCEPTION)
8857
0
                    return -1;
8858
0
                else if (x==enc_FAILED) {
8859
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8860
0
                    return -1;
8861
0
                }
8862
0
            }
8863
0
        }
8864
0
        *inpos = collendpos;
8865
0
        break;
8866
8867
0
    default:
8868
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8869
0
                                                      encoding, reason, unicode, exceptionObject,
8870
0
                                                      collstartpos, collendpos, &newpos);
8871
0
        if (repunicode == NULL)
8872
0
            return -1;
8873
0
        if (PyBytes_Check(repunicode)) {
8874
            /* Directly copy bytes result to output. */
8875
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8876
0
            Py_ssize_t requiredsize;
8877
0
            repsize = PyBytes_Size(repunicode);
8878
0
            requiredsize = *respos + repsize;
8879
0
            if (requiredsize > outsize)
8880
                /* Make room for all additional bytes. */
8881
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8882
0
                    Py_DECREF(repunicode);
8883
0
                    return -1;
8884
0
                }
8885
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8886
0
                   PyBytes_AsString(repunicode),  repsize);
8887
0
            *respos += repsize;
8888
0
            *inpos = newpos;
8889
0
            Py_DECREF(repunicode);
8890
0
            break;
8891
0
        }
8892
        /* generate replacement  */
8893
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8894
0
        data = PyUnicode_DATA(repunicode);
8895
0
        kind = PyUnicode_KIND(repunicode);
8896
0
        for (index = 0; index < repsize; index++) {
8897
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8898
0
            x = charmapencode_output(repch, mapping, writer, respos);
8899
0
            if (x==enc_EXCEPTION) {
8900
0
                Py_DECREF(repunicode);
8901
0
                return -1;
8902
0
            }
8903
0
            else if (x==enc_FAILED) {
8904
0
                Py_DECREF(repunicode);
8905
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8906
0
                return -1;
8907
0
            }
8908
0
        }
8909
0
        *inpos = newpos;
8910
0
        Py_DECREF(repunicode);
8911
0
    }
8912
0
    return 0;
8913
0
}
8914
8915
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    /* Encode unicode to bytes through mapping, applying the errors handler
       to characters that have no mapping.  Returns a bytes object, or NULL
       on error. */

    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    const Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const void *data = PyUnicode_DATA(unicode);
    const int kind = PyUnicode_KIND(unicode);

    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    Py_ssize_t inpos = 0;    /* current input position */
    Py_ssize_t respos = 0;   /* current output position */

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        goto onError;
    }

    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
        /* Encoding-map path: keep the buffer pointer and capacity in
           locals and refresh them whenever the writer may have changed. */
        char *outstart = _PyBytesWriter_GetData(writer);
        Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);

        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            int res = encoding_map_lookup(ch, mapping);

            if (res == -1) {
                /* unencodable character */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj,
                                           errors,
                                           writer, &respos)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
                continue;
            }

            Py_ssize_t requiredsize = respos + 1;
            if (outsize < requiredsize) {
                if (charmapencode_resize(writer, &respos, requiredsize)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
            }
            outstart[respos++] = (char)res;

            /* done with this character => adjust input position */
            ++inpos;
        }
    }
    else {
        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            /* try to encode it */
            charmapencode_result x = charmapencode_output(ch, mapping,
                                                          writer, &respos);
            switch (x) {
            case enc_EXCEPTION: /* error */
                goto onError;
            case enc_FAILED: /* unencodable character */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj,
                                           errors,
                                           writer, &respos)) {
                    goto onError;
                }
                break;
            default:
                /* done with this character => adjust input position */
                ++inpos;
                break;
            }
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Resize if we allocated too much */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9023
9024
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    /* Encode a str object through mapping with no error handler name
       (errors is passed as NULL).  Fails with PyErr_BadArgument() if
       unicode is not a str or mapping is NULL. */
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
9034
9035
/* create or adjust a UnicodeTranslateError */
9036
static void
9037
make_translate_exception(PyObject **exceptionObject,
9038
                         PyObject *unicode,
9039
                         Py_ssize_t startpos, Py_ssize_t endpos,
9040
                         const char *reason)
9041
0
{
9042
0
    if (*exceptionObject == NULL) {
9043
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9044
0
            unicode, startpos, endpos, reason);
9045
0
    }
9046
0
    else {
9047
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9048
0
            goto onError;
9049
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9050
0
            goto onError;
9051
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9052
0
            goto onError;
9053
0
        return;
9054
0
      onError:
9055
0
        Py_CLEAR(*exceptionObject);
9056
0
    }
9057
0
}
9058
9059
/* error handling callback helper:
9060
   build arguments, call the callback and check the arguments,
9061
   put the result into newpos and return the replacement string, which
9062
   has to be freed by the caller */
9063
static PyObject *
9064
unicode_translate_call_errorhandler(const char *errors,
9065
                                    PyObject **errorHandler,
9066
                                    const char *reason,
9067
                                    PyObject *unicode, PyObject **exceptionObject,
9068
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9069
                                    Py_ssize_t *newpos)
9070
0
{
9071
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9072
9073
0
    Py_ssize_t i_newpos;
9074
0
    PyObject *restuple;
9075
0
    PyObject *resunicode;
9076
9077
0
    if (*errorHandler == NULL) {
9078
0
        *errorHandler = PyCodec_LookupError(errors);
9079
0
        if (*errorHandler == NULL)
9080
0
            return NULL;
9081
0
    }
9082
9083
0
    make_translate_exception(exceptionObject,
9084
0
                             unicode, startpos, endpos, reason);
9085
0
    if (*exceptionObject == NULL)
9086
0
        return NULL;
9087
9088
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9089
0
    if (restuple == NULL)
9090
0
        return NULL;
9091
0
    if (!PyTuple_Check(restuple)) {
9092
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9093
0
        Py_DECREF(restuple);
9094
0
        return NULL;
9095
0
    }
9096
0
    if (!PyArg_ParseTuple(restuple, argparse,
9097
0
                          &resunicode, &i_newpos)) {
9098
0
        Py_DECREF(restuple);
9099
0
        return NULL;
9100
0
    }
9101
0
    if (i_newpos<0)
9102
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9103
0
    else
9104
0
        *newpos = i_newpos;
9105
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9106
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9107
0
        Py_DECREF(restuple);
9108
0
        return NULL;
9109
0
    }
9110
0
    Py_INCREF(resunicode);
9111
0
    Py_DECREF(restuple);
9112
0
    return resunicode;
9113
0
}
9114
9115
/* Lookup the character ch in the mapping and put the result in result,
9116
   which must be decrefed by the caller.
9117
   The result can be PyLong, PyUnicode, None or NULL.
9118
   If the result is PyLong, put its value in replace.
9119
   Return 0 on success, -1 on error */
9120
static int
9121
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9122
742
{
9123
742
    PyObject *w = PyLong_FromLong((long)c);
9124
742
    PyObject *x;
9125
9126
742
    if (w == NULL)
9127
0
        return -1;
9128
742
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9129
742
    Py_DECREF(w);
9130
742
    if (rc == 0) {
9131
        /* No mapping found means: use 1:1 mapping. */
9132
326
        *result = NULL;
9133
326
        return 0;
9134
326
    }
9135
416
    if (x == NULL) {
9136
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9137
            /* No mapping found means: use 1:1 mapping. */
9138
0
            PyErr_Clear();
9139
0
            *result = NULL;
9140
0
            return 0;
9141
0
        } else
9142
0
            return -1;
9143
0
    }
9144
416
    else if (x == Py_None) {
9145
0
        *result = x;
9146
0
        return 0;
9147
0
    }
9148
416
    else if (PyLong_Check(x)) {
9149
0
        long value = PyLong_AsLong(x);
9150
0
        if (value < 0 || value > MAX_UNICODE) {
9151
0
            PyErr_Format(PyExc_ValueError,
9152
0
                         "character mapping must be in range(0x%lx)",
9153
0
                         (unsigned long)MAX_UNICODE + 1);
9154
0
            Py_DECREF(x);
9155
0
            return -1;
9156
0
        }
9157
0
        *result = x;
9158
0
        *replace = (Py_UCS4)value;
9159
0
        return 0;
9160
0
    }
9161
416
    else if (PyUnicode_Check(x)) {
9162
416
        *result = x;
9163
416
        return 0;
9164
416
    }
9165
0
    else {
9166
        /* wrong return value */
9167
0
        PyErr_SetString(PyExc_TypeError,
9168
0
                        "character mapping must return integer, None or str");
9169
0
        Py_DECREF(x);
9170
0
        return -1;
9171
0
    }
9172
416
}
9173
9174
/* lookup the character, write the result into the writer.
9175
   Return 1 if the result was written into the writer, return 0 if the mapping
9176
   was undefined, raise an exception return -1 on error. */
9177
static int
9178
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9179
                        _PyUnicodeWriter *writer)
9180
382
{
9181
382
    PyObject *item;
9182
382
    Py_UCS4 replace;
9183
9184
382
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9185
0
        return -1;
9186
9187
382
    if (item == NULL) {
9188
        /* not found => default to 1:1 mapping */
9189
122
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9190
0
            return -1;
9191
0
        }
9192
122
        return 1;
9193
122
    }
9194
9195
260
    if (item == Py_None) {
9196
0
        Py_DECREF(item);
9197
0
        return 0;
9198
0
    }
9199
9200
260
    if (PyLong_Check(item)) {
9201
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9202
0
            Py_DECREF(item);
9203
0
            return -1;
9204
0
        }
9205
0
        Py_DECREF(item);
9206
0
        return 1;
9207
0
    }
9208
9209
260
    if (!PyUnicode_Check(item)) {
9210
0
        Py_DECREF(item);
9211
0
        return -1;
9212
0
    }
9213
9214
260
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9215
0
        Py_DECREF(item);
9216
0
        return -1;
9217
0
    }
9218
9219
260
    Py_DECREF(item);
9220
260
    return 1;
9221
260
}
9222
9223
static int
9224
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9225
                              Py_UCS1 *translate)
9226
360
{
9227
360
    PyObject *item = NULL;
9228
360
    Py_UCS4 replace;
9229
360
    int ret = 0;
9230
9231
360
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9232
0
        return -1;
9233
0
    }
9234
9235
360
    if (item == Py_None) {
9236
        /* deletion */
9237
0
        translate[ch] = 0xfe;
9238
0
    }
9239
360
    else if (item == NULL) {
9240
        /* not found => default to 1:1 mapping */
9241
204
        translate[ch] = ch;
9242
204
        return 1;
9243
204
    }
9244
156
    else if (PyLong_Check(item)) {
9245
0
        if (replace > 127) {
9246
            /* invalid character or character outside ASCII:
9247
               skip the fast translate */
9248
0
            goto exit;
9249
0
        }
9250
0
        translate[ch] = (Py_UCS1)replace;
9251
0
    }
9252
156
    else if (PyUnicode_Check(item)) {
9253
156
        if (PyUnicode_GET_LENGTH(item) != 1)
9254
156
            goto exit;
9255
9256
0
        replace = PyUnicode_READ_CHAR(item, 0);
9257
0
        if (replace > 127)
9258
0
            goto exit;
9259
0
        translate[ch] = (Py_UCS1)replace;
9260
0
    }
9261
0
    else {
9262
        /* not None, NULL, long or unicode */
9263
0
        goto exit;
9264
0
    }
9265
0
    ret = 1;
9266
9267
156
  exit:
9268
156
    Py_DECREF(item);
9269
156
    return ret;
9270
0
}
9271
9272
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9273
   was translated into writer, return 0 if the input string was partially
9274
   translated into writer, raise an exception and return -1 on error. */
9275
static int
9276
unicode_fast_translate(PyObject *input, PyObject *mapping,
9277
                       _PyUnicodeWriter *writer, int ignore,
9278
                       Py_ssize_t *input_pos)
9279
300
{
9280
300
    Py_UCS1 ascii_table[128], ch, ch2;
9281
300
    Py_ssize_t len;
9282
300
    const Py_UCS1 *in, *end;
9283
300
    Py_UCS1 *out;
9284
300
    int res = 0;
9285
9286
300
    len = PyUnicode_GET_LENGTH(input);
9287
9288
300
    memset(ascii_table, 0xff, 128);
9289
9290
300
    in = PyUnicode_1BYTE_DATA(input);
9291
300
    end = in + len;
9292
9293
300
    assert(PyUnicode_IS_ASCII(writer->buffer));
9294
300
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9295
300
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9296
9297
546
    for (; in < end; in++) {
9298
402
        ch = *in;
9299
402
        ch2 = ascii_table[ch];
9300
402
        if (ch2 == 0xff) {
9301
360
            int translate = unicode_fast_translate_lookup(mapping, ch,
9302
360
                                                          ascii_table);
9303
360
            if (translate < 0)
9304
0
                return -1;
9305
360
            if (translate == 0)
9306
156
                goto exit;
9307
204
            ch2 = ascii_table[ch];
9308
204
        }
9309
246
        if (ch2 == 0xfe) {
9310
0
            if (ignore)
9311
0
                continue;
9312
0
            goto exit;
9313
0
        }
9314
246
        assert(ch2 < 128);
9315
246
        *out = ch2;
9316
246
        out++;
9317
246
    }
9318
144
    res = 1;
9319
9320
300
exit:
9321
300
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9322
300
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9323
300
    return res;
9324
144
}
9325
9326
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    /* Translate input through mapping and return the new string, or NULL
       on error.  Characters that map to None are dropped when errors is
       "ignore"; otherwise the error handler supplies the replacement. */
    const char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    _PyUnicodeWriter writer;
    Py_ssize_t pos = 0;
    int ignore;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    const void *data = PyUnicode_DATA(input);
    const int kind = PyUnicode_KIND(input);
    const Py_ssize_t size = PyUnicode_GET_LENGTH(input);

    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    if (PyUnicode_IS_ASCII(input)) {
        /* The fast path reports in pos how much of the input it handled. */
        int fast = unicode_fast_translate(input, mapping, &writer, ignore, &pos);
        if (fast < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (fast == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
    }

    while (pos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        int written = charmaptranslate_output(ch, mapping, &writer);
        if (written < 0) {
            goto onError;
        }
        if (written != 0) {
            /* it worked => adjust input pointer */
            ++pos;
            continue;
        }

        /* untranslatable character: collect the whole run
           [run_start, run_end) of characters that map to None */
        const Py_ssize_t run_start = pos;
        Py_ssize_t run_end = pos + 1;
        for (; run_end < size; ++run_end) {
            PyObject *probe;
            Py_UCS4 unused;
            ch = PyUnicode_READ(kind, data, run_end);
            if (charmaptranslate_lookup(ch, mapping, &probe, &unused)) {
                goto onError;
            }
            int deleted = (probe == Py_None);
            Py_XDECREF(probe);
            if (!deleted) {
                break;
            }
        }

        if (ignore) {
            pos = run_end;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *repunicode = unicode_translate_call_errorhandler(
            errors, &errorHandler, reason, input, &exc,
            run_start, run_end, &newpos);
        if (repunicode == NULL) {
            goto onError;
        }
        if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
            Py_DECREF(repunicode);
            goto onError;
        }
        Py_DECREF(repunicode);
        pos = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9442
9443
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    /* Public entry point: check str with ensure_unicode(), then delegate
       to _PyUnicode_TranslateCharmap(). */
    return (ensure_unicode(str) < 0)
        ? NULL
        : _PyUnicode_TranslateCharmap(str, mapping, errors);
}
9452
9453
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    /* Return an ASCII string in which code points below 127 are copied,
       whitespace becomes ' ' and decimal digits become '0'..'9'.  At the
       first character that is none of these, a '?' is written and the
       result is cut off right after it.  An ASCII input is returned
       unchanged as a new reference. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* If the string is already ASCII, just return the same string */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    for (Py_ssize_t i = 0; i < len; ++i) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (ch < 127) {
            out[i] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[i] = ' ';
            continue;
        }

        int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal < 0) {
            /* not a digit: mark it, terminate and shorten the result */
            out[i] = '?';
            out[i+1] = '\0';
            _PyUnicode_LENGTH(result) = i + 1;
            break;
        }
        out[i] = '0' + decimal;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9498
9499
/* --- Helpers ------------------------------------------------------------ */
9500
9501
/* Helper macro to fix up start/end slice values against a length.

   end is clamped to len; a negative end or start is offset by len and
   then clamped to 0. start and end must be modifiable lvalues and are
   updated in place. Every argument is parenthesized so that expression
   arguments (e.g. "cond ? a : b") are parsed as a unit; arguments are
   still evaluated more than once, so they must not have side effects. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9520
9521
static Py_ssize_t
9522
any_find_slice(PyObject* s1, PyObject* s2,
9523
               Py_ssize_t start,
9524
               Py_ssize_t end,
9525
               int direction)
9526
27.4M
{
9527
27.4M
    int kind1, kind2;
9528
27.4M
    const void *buf1, *buf2;
9529
27.4M
    Py_ssize_t len1, len2, result;
9530
9531
27.4M
    kind1 = PyUnicode_KIND(s1);
9532
27.4M
    kind2 = PyUnicode_KIND(s2);
9533
27.4M
    if (kind1 < kind2)
9534
0
        return -1;
9535
9536
27.4M
    len1 = PyUnicode_GET_LENGTH(s1);
9537
27.4M
    len2 = PyUnicode_GET_LENGTH(s2);
9538
27.4M
    ADJUST_INDICES(start, end, len1);
9539
27.4M
    if (end - start < len2)
9540
2.14M
        return -1;
9541
9542
25.2M
    buf1 = PyUnicode_DATA(s1);
9543
25.2M
    buf2 = PyUnicode_DATA(s2);
9544
25.2M
    if (len2 == 1) {
9545
24.4M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9546
24.4M
        result = findchar((const char *)buf1 + kind1*start,
9547
24.4M
                          kind1, end - start, ch, direction);
9548
24.4M
        if (result == -1)
9549
3.80M
            return -1;
9550
20.6M
        else
9551
20.6M
            return start + result;
9552
24.4M
    }
9553
9554
810k
    if (kind2 != kind1) {
9555
188k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9556
188k
        if (!buf2)
9557
0
            return -2;
9558
188k
    }
9559
9560
810k
    if (direction > 0) {
9561
810k
        switch (kind1) {
9562
622k
        case PyUnicode_1BYTE_KIND:
9563
622k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9564
368k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9565
254k
            else
9566
254k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9567
622k
            break;
9568
67.6k
        case PyUnicode_2BYTE_KIND:
9569
67.6k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9570
67.6k
            break;
9571
120k
        case PyUnicode_4BYTE_KIND:
9572
120k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9573
120k
            break;
9574
0
        default:
9575
0
            Py_UNREACHABLE();
9576
810k
        }
9577
810k
    }
9578
0
    else {
9579
0
        switch (kind1) {
9580
0
        case PyUnicode_1BYTE_KIND:
9581
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9582
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9583
0
            else
9584
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9585
0
            break;
9586
0
        case PyUnicode_2BYTE_KIND:
9587
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9588
0
            break;
9589
0
        case PyUnicode_4BYTE_KIND:
9590
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9591
0
            break;
9592
0
        default:
9593
0
            Py_UNREACHABLE();
9594
0
        }
9595
0
    }
9596
9597
810k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9598
810k
    if (kind2 != kind1)
9599
188k
        PyMem_Free((void *)buf2);
9600
9601
810k
    return result;
9602
810k
}
9603
9604
9605
Py_ssize_t
9606
PyUnicode_Count(PyObject *str,
9607
                PyObject *substr,
9608
                Py_ssize_t start,
9609
                Py_ssize_t end)
9610
0
{
9611
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9612
0
        return -1;
9613
9614
0
    return unicode_count_impl(str, substr, start, end);
9615
0
}
9616
9617
Py_ssize_t
9618
PyUnicode_Find(PyObject *str,
9619
               PyObject *substr,
9620
               Py_ssize_t start,
9621
               Py_ssize_t end,
9622
               int direction)
9623
0
{
9624
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9625
0
        return -2;
9626
9627
0
    return any_find_slice(str, substr, start, end, direction);
9628
0
}
9629
9630
Py_ssize_t
9631
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9632
                   Py_ssize_t start, Py_ssize_t end,
9633
                   int direction)
9634
3.81M
{
9635
3.81M
    int kind;
9636
3.81M
    Py_ssize_t len, result;
9637
3.81M
    len = PyUnicode_GET_LENGTH(str);
9638
3.81M
    ADJUST_INDICES(start, end, len);
9639
3.81M
    if (end - start < 1)
9640
0
        return -1;
9641
3.81M
    kind = PyUnicode_KIND(str);
9642
3.81M
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9643
3.81M
                      kind, end-start, ch, direction);
9644
3.81M
    if (result == -1)
9645
2.86M
        return -1;
9646
949k
    else
9647
949k
        return start + result;
9648
3.81M
}
9649
9650
static int
9651
tailmatch(PyObject *self,
9652
          PyObject *substring,
9653
          Py_ssize_t start,
9654
          Py_ssize_t end,
9655
          int direction)
9656
57.7M
{
9657
57.7M
    int kind_self;
9658
57.7M
    int kind_sub;
9659
57.7M
    const void *data_self;
9660
57.7M
    const void *data_sub;
9661
57.7M
    Py_ssize_t offset;
9662
57.7M
    Py_ssize_t i;
9663
57.7M
    Py_ssize_t end_sub;
9664
9665
57.7M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9666
57.7M
    end -= PyUnicode_GET_LENGTH(substring);
9667
57.7M
    if (end < start)
9668
10.0M
        return 0;
9669
9670
47.7M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9671
0
        return 1;
9672
9673
47.7M
    kind_self = PyUnicode_KIND(self);
9674
47.7M
    data_self = PyUnicode_DATA(self);
9675
47.7M
    kind_sub = PyUnicode_KIND(substring);
9676
47.7M
    data_sub = PyUnicode_DATA(substring);
9677
47.7M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9678
9679
47.7M
    if (direction > 0)
9680
7.41M
        offset = end;
9681
40.3M
    else
9682
40.3M
        offset = start;
9683
9684
47.7M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9685
47.7M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9686
33.1M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9687
33.1M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9688
        /* If both are of the same kind, memcmp is sufficient */
9689
13.3M
        if (kind_self == kind_sub) {
9690
6.63M
            return ! memcmp((char *)data_self +
9691
6.63M
                                (offset * PyUnicode_KIND(substring)),
9692
6.63M
                            data_sub,
9693
6.63M
                            PyUnicode_GET_LENGTH(substring) *
9694
6.63M
                                PyUnicode_KIND(substring));
9695
6.63M
        }
9696
        /* otherwise we have to compare each character by first accessing it */
9697
6.66M
        else {
9698
            /* We do not need to compare 0 and len(substring)-1 because
9699
               the if statement above ensured already that they are equal
9700
               when we end up here. */
9701
6.81M
            for (i = 1; i < end_sub; ++i) {
9702
177k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9703
177k
                    PyUnicode_READ(kind_sub, data_sub, i))
9704
26.5k
                    return 0;
9705
177k
            }
9706
6.63M
            return 1;
9707
6.66M
        }
9708
13.3M
    }
9709
9710
34.4M
    return 0;
9711
47.7M
}
9712
9713
Py_ssize_t
9714
PyUnicode_Tailmatch(PyObject *str,
9715
                    PyObject *substr,
9716
                    Py_ssize_t start,
9717
                    Py_ssize_t end,
9718
                    int direction)
9719
486
{
9720
486
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9721
0
        return -1;
9722
9723
486
    return tailmatch(str, substr, start, end, direction);
9724
486
}
9725
9726
/* Create a new string of the same length as `self` (max char 127) and fill
   it via _Py_bytes_lower() when `lower` is non-zero, or _Py_bytes_upper()
   otherwise.

   Presumably only called for ASCII strings -- the data is handed to the
   bytes helpers as a plain char buffer; confirm against callers.

   Returns a new reference, or NULL when PyUnicode_New() fails. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);

    PyObject *converted = PyUnicode_New(length, 127);
    if (converted == NULL) {
        return NULL;
    }

    const char *src = PyUnicode_DATA(self);
    char *dst = PyUnicode_DATA(converted);
    if (lower) {
        _Py_bytes_lower(dst, src, length);
    }
    else {
        _Py_bytes_upper(dst, src, length);
    }
    return converted;
}
9744
9745
static Py_UCS4
9746
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9747
456k
{
9748
456k
    Py_ssize_t j;
9749
456k
    int final_sigma;
9750
456k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9751
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9752
9753
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9754
9755
    where ! is a negation and \p{xxx} is a character with property xxx.
9756
    */
9757
840k
    for (j = i - 1; j >= 0; j--) {
9758
839k
        c = PyUnicode_READ(kind, data, j);
9759
839k
        if (!_PyUnicode_IsCaseIgnorable(c))
9760
454k
            break;
9761
839k
    }
9762
456k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9763
456k
    if (final_sigma) {
9764
706k
        for (j = i + 1; j < length; j++) {
9765
703k
            c = PyUnicode_READ(kind, data, j);
9766
703k
            if (!_PyUnicode_IsCaseIgnorable(c))
9767
352k
                break;
9768
703k
        }
9769
355k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9770
355k
    }
9771
456k
    return (final_sigma) ? 0x3C2 : 0x3C3;
9772
456k
}
9773
9774
static int
9775
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9776
           Py_UCS4 c, Py_UCS4 *mapped)
9777
82.0M
{
9778
    /* Obscure special case. */
9779
82.0M
    if (c == 0x3A3) {
9780
456k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9781
456k
        return 1;
9782
456k
    }
9783
81.6M
    return _PyUnicode_ToLowerFull(c, mapped);
9784
82.0M
}
9785
9786
static Py_ssize_t
9787
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9788
0
{
9789
0
    Py_ssize_t i, k = 0;
9790
0
    int n_res, j;
9791
0
    Py_UCS4 c, mapped[3];
9792
9793
0
    c = PyUnicode_READ(kind, data, 0);
9794
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9795
0
    for (j = 0; j < n_res; j++) {
9796
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9797
0
        res[k++] = mapped[j];
9798
0
    }
9799
0
    for (i = 1; i < length; i++) {
9800
0
        c = PyUnicode_READ(kind, data, i);
9801
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9802
0
        for (j = 0; j < n_res; j++) {
9803
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9804
0
            res[k++] = mapped[j];
9805
0
        }
9806
0
    }
9807
0
    return k;
9808
0
}
9809
9810
static Py_ssize_t
9811
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9812
0
    Py_ssize_t i, k = 0;
9813
9814
0
    for (i = 0; i < length; i++) {
9815
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9816
0
        int n_res, j;
9817
0
        if (Py_UNICODE_ISUPPER(c)) {
9818
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9819
0
        }
9820
0
        else if (Py_UNICODE_ISLOWER(c)) {
9821
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9822
0
        }
9823
0
        else {
9824
0
            n_res = 1;
9825
0
            mapped[0] = c;
9826
0
        }
9827
0
        for (j = 0; j < n_res; j++) {
9828
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9829
0
            res[k++] = mapped[j];
9830
0
        }
9831
0
    }
9832
0
    return k;
9833
0
}
9834
9835
static Py_ssize_t
9836
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9837
                  Py_UCS4 *maxchar, int lower)
9838
4.83M
{
9839
4.83M
    Py_ssize_t i, k = 0;
9840
9841
86.9M
    for (i = 0; i < length; i++) {
9842
82.0M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9843
82.0M
        int n_res, j;
9844
82.0M
        if (lower)
9845
82.0M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9846
0
        else
9847
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9848
164M
        for (j = 0; j < n_res; j++) {
9849
82.0M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9850
82.0M
            res[k++] = mapped[j];
9851
82.0M
        }
9852
82.0M
    }
9853
4.83M
    return k;
9854
4.83M
}
9855
9856
static Py_ssize_t
9857
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9858
0
{
9859
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9860
0
}
9861
9862
static Py_ssize_t
9863
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9864
4.83M
{
9865
4.83M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9866
4.83M
}
9867
9868
static Py_ssize_t
9869
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9870
0
{
9871
0
    Py_ssize_t i, k = 0;
9872
9873
0
    for (i = 0; i < length; i++) {
9874
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9875
0
        Py_UCS4 mapped[3];
9876
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9877
0
        for (j = 0; j < n_res; j++) {
9878
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9879
0
            res[k++] = mapped[j];
9880
0
        }
9881
0
    }
9882
0
    return k;
9883
0
}
9884
9885
static Py_ssize_t
9886
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9887
0
{
9888
0
    Py_ssize_t i, k = 0;
9889
0
    int previous_is_cased;
9890
9891
0
    previous_is_cased = 0;
9892
0
    for (i = 0; i < length; i++) {
9893
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9894
0
        Py_UCS4 mapped[3];
9895
0
        int n_res, j;
9896
9897
0
        if (previous_is_cased)
9898
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9899
0
        else
9900
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9901
9902
0
        for (j = 0; j < n_res; j++) {
9903
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9904
0
            res[k++] = mapped[j];
9905
0
        }
9906
9907
0
        previous_is_cased = _PyUnicode_IsCased(c);
9908
0
    }
9909
0
    return k;
9910
0
}
9911
9912
static PyObject *
9913
case_operation(PyObject *self,
9914
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9915
4.83M
{
9916
4.83M
    PyObject *res = NULL;
9917
4.83M
    Py_ssize_t length, newlength = 0;
9918
4.83M
    int kind, outkind;
9919
4.83M
    const void *data;
9920
4.83M
    void *outdata;
9921
4.83M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9922
9923
4.83M
    kind = PyUnicode_KIND(self);
9924
4.83M
    data = PyUnicode_DATA(self);
9925
4.83M
    length = PyUnicode_GET_LENGTH(self);
9926
4.83M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9927
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9928
0
        return NULL;
9929
0
    }
9930
4.83M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9931
4.83M
    if (tmp == NULL)
9932
0
        return PyErr_NoMemory();
9933
4.83M
    newlength = perform(kind, data, length, tmp, &maxchar);
9934
4.83M
    res = PyUnicode_New(newlength, maxchar);
9935
4.83M
    if (res == NULL)
9936
0
        goto leave;
9937
4.83M
    tmpend = tmp + newlength;
9938
4.83M
    outdata = PyUnicode_DATA(res);
9939
4.83M
    outkind = PyUnicode_KIND(res);
9940
4.83M
    switch (outkind) {
9941
195k
    case PyUnicode_1BYTE_KIND:
9942
195k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9943
195k
        break;
9944
4.47M
    case PyUnicode_2BYTE_KIND:
9945
4.47M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9946
4.47M
        break;
9947
165k
    case PyUnicode_4BYTE_KIND:
9948
165k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9949
165k
        break;
9950
0
    default:
9951
0
        Py_UNREACHABLE();
9952
4.83M
    }
9953
4.83M
  leave:
9954
4.83M
    PyMem_Free(tmp);
9955
4.83M
    return res;
9956
4.83M
}
9957
9958
/* Join the items of the iterable `seq` with `separator`.

   `seq` is first converted with PySequence_Fast(); the item array of the
   converted sequence is then handed to _PyUnicode_JoinArray() inside a
   critical section.

   Returns the result of _PyUnicode_JoinArray(), or NULL when
   PySequence_Fast() fails. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");
    if (fast == NULL) {
        return NULL;
    }

    PyObject *joined;

    /* NOTE(review): the critical section is entered on `seq` while the items
       are read from the PySequence_Fast() result -- presumably these are the
       same object whenever locking matters; confirm. */
    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    PyObject **array = PySequence_Fast_ITEMS(fast);
    Py_ssize_t count = PySequence_Fast_GET_SIZE(fast);
    joined = _PyUnicode_JoinArray(separator, array, count);

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
9982
9983
PyObject *
9984
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9985
38.5M
{
9986
38.5M
    PyObject *res = NULL; /* the result */
9987
38.5M
    PyObject *sep = NULL;
9988
38.5M
    Py_ssize_t seplen;
9989
38.5M
    PyObject *item;
9990
38.5M
    Py_ssize_t sz, i, res_offset;
9991
38.5M
    Py_UCS4 maxchar;
9992
38.5M
    Py_UCS4 item_maxchar;
9993
38.5M
    int use_memcpy;
9994
38.5M
    unsigned char *res_data = NULL, *sep_data = NULL;
9995
38.5M
    PyObject *last_obj;
9996
38.5M
    int kind = 0;
9997
9998
    /* If empty sequence, return u"". */
9999
38.5M
    if (seqlen == 0) {
10000
7.20M
        _Py_RETURN_UNICODE_EMPTY();
10001
7.20M
    }
10002
10003
    /* If singleton sequence with an exact Unicode, return that. */
10004
31.3M
    last_obj = NULL;
10005
31.3M
    if (seqlen == 1) {
10006
10.6M
        if (PyUnicode_CheckExact(items[0])) {
10007
9.22M
            res = items[0];
10008
9.22M
            return Py_NewRef(res);
10009
9.22M
        }
10010
1.37M
        seplen = 0;
10011
1.37M
        maxchar = 0;
10012
1.37M
    }
10013
20.7M
    else {
10014
        /* Set up sep and seplen */
10015
20.7M
        if (separator == NULL) {
10016
            /* fall back to a blank space separator */
10017
0
            sep = PyUnicode_FromOrdinal(' ');
10018
0
            if (!sep)
10019
0
                goto onError;
10020
0
            seplen = 1;
10021
0
            maxchar = 32;
10022
0
        }
10023
20.7M
        else {
10024
20.7M
            if (!PyUnicode_Check(separator)) {
10025
0
                PyErr_Format(PyExc_TypeError,
10026
0
                             "separator: expected str instance,"
10027
0
                             " %.80s found",
10028
0
                             Py_TYPE(separator)->tp_name);
10029
0
                goto onError;
10030
0
            }
10031
20.7M
            sep = separator;
10032
20.7M
            seplen = PyUnicode_GET_LENGTH(separator);
10033
20.7M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10034
            /* inc refcount to keep this code path symmetric with the
10035
               above case of a blank separator */
10036
20.7M
            Py_INCREF(sep);
10037
20.7M
        }
10038
20.7M
        last_obj = sep;
10039
20.7M
    }
10040
10041
    /* There are at least two things to join, or else we have a subclass
10042
     * of str in the sequence.
10043
     * Do a pre-pass to figure out the total amount of space we'll
10044
     * need (sz), and see whether all argument are strings.
10045
     */
10046
22.0M
    sz = 0;
10047
#ifdef Py_DEBUG
10048
    use_memcpy = 0;
10049
#else
10050
22.0M
    use_memcpy = 1;
10051
22.0M
#endif
10052
217M
    for (i = 0; i < seqlen; i++) {
10053
195M
        size_t add_sz;
10054
195M
        item = items[i];
10055
195M
        if (!PyUnicode_Check(item)) {
10056
0
            PyErr_Format(PyExc_TypeError,
10057
0
                         "sequence item %zd: expected str instance,"
10058
0
                         " %.80s found",
10059
0
                         i, Py_TYPE(item)->tp_name);
10060
0
            goto onError;
10061
0
        }
10062
195M
        add_sz = PyUnicode_GET_LENGTH(item);
10063
195M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10064
195M
        maxchar = Py_MAX(maxchar, item_maxchar);
10065
195M
        if (i != 0) {
10066
173M
            add_sz += seplen;
10067
173M
        }
10068
195M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10069
0
            PyErr_SetString(PyExc_OverflowError,
10070
0
                            "join() result is too long for a Python string");
10071
0
            goto onError;
10072
0
        }
10073
195M
        sz += add_sz;
10074
195M
        if (use_memcpy && last_obj != NULL) {
10075
118M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10076
2.97M
                use_memcpy = 0;
10077
118M
        }
10078
195M
        last_obj = item;
10079
195M
    }
10080
10081
22.0M
    res = PyUnicode_New(sz, maxchar);
10082
22.0M
    if (res == NULL)
10083
0
        goto onError;
10084
10085
    /* Catenate everything. */
10086
#ifdef Py_DEBUG
10087
    use_memcpy = 0;
10088
#else
10089
22.0M
    if (use_memcpy) {
10090
19.1M
        res_data = PyUnicode_1BYTE_DATA(res);
10091
19.1M
        kind = PyUnicode_KIND(res);
10092
19.1M
        if (seplen != 0)
10093
241k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10094
19.1M
    }
10095
22.0M
#endif
10096
22.0M
    if (use_memcpy) {
10097
116M
        for (i = 0; i < seqlen; ++i) {
10098
97.5M
            Py_ssize_t itemlen;
10099
97.5M
            item = items[i];
10100
10101
            /* Copy item, and maybe the separator. */
10102
97.5M
            if (i && seplen != 0) {
10103
354k
                memcpy(res_data,
10104
354k
                          sep_data,
10105
354k
                          kind * seplen);
10106
354k
                res_data += kind * seplen;
10107
354k
            }
10108
10109
97.5M
            itemlen = PyUnicode_GET_LENGTH(item);
10110
97.5M
            if (itemlen != 0) {
10111
86.7M
                memcpy(res_data,
10112
86.7M
                          PyUnicode_DATA(item),
10113
86.7M
                          kind * itemlen);
10114
86.7M
                res_data += kind * itemlen;
10115
86.7M
            }
10116
97.5M
        }
10117
19.1M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10118
19.1M
                           + kind * PyUnicode_GET_LENGTH(res));
10119
19.1M
    }
10120
2.97M
    else {
10121
101M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10122
98.2M
            Py_ssize_t itemlen;
10123
98.2M
            item = items[i];
10124
10125
            /* Copy item, and maybe the separator. */
10126
98.2M
            if (i && seplen != 0) {
10127
1.29M
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10128
1.29M
                res_offset += seplen;
10129
1.29M
            }
10130
10131
98.2M
            itemlen = PyUnicode_GET_LENGTH(item);
10132
98.2M
            if (itemlen != 0) {
10133
97.5M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10134
97.5M
                res_offset += itemlen;
10135
97.5M
            }
10136
98.2M
        }
10137
2.97M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10138
2.97M
    }
10139
10140
22.0M
    Py_XDECREF(sep);
10141
22.0M
    assert(_PyUnicode_CheckConsistency(res, 1));
10142
22.0M
    return res;
10143
10144
0
  onError:
10145
0
    Py_XDECREF(sep);
10146
0
    Py_XDECREF(res);
10147
0
    return NULL;
10148
22.0M
}
10149
10150
void
10151
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10152
                    Py_UCS4 fill_char)
10153
652
{
10154
652
    const int kind = PyUnicode_KIND(unicode);
10155
652
    void *data = PyUnicode_DATA(unicode);
10156
652
    assert(_PyUnicode_IsModifiable(unicode));
10157
652
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10158
652
    assert(start >= 0);
10159
652
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10160
652
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10161
652
}
10162
10163
Py_ssize_t
10164
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10165
               Py_UCS4 fill_char)
10166
652
{
10167
652
    Py_ssize_t maxlen;
10168
10169
652
    if (!PyUnicode_Check(unicode)) {
10170
0
        PyErr_BadInternalCall();
10171
0
        return -1;
10172
0
    }
10173
652
    if (unicode_check_modifiable(unicode))
10174
0
        return -1;
10175
10176
652
    if (start < 0) {
10177
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10178
0
        return -1;
10179
0
    }
10180
652
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10181
0
        PyErr_SetString(PyExc_ValueError,
10182
0
                         "fill character is bigger than "
10183
0
                         "the string maximum character");
10184
0
        return -1;
10185
0
    }
10186
10187
652
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10188
652
    length = Py_MIN(maxlen, length);
10189
652
    if (length <= 0)
10190
0
        return 0;
10191
10192
652
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10193
652
    return length;
10194
652
}
10195
10196
static PyObject *
10197
pad(PyObject *self,
10198
    Py_ssize_t left,
10199
    Py_ssize_t right,
10200
    Py_UCS4 fill)
10201
68
{
10202
68
    PyObject *u;
10203
68
    Py_UCS4 maxchar;
10204
68
    int kind;
10205
68
    void *data;
10206
10207
68
    if (left < 0)
10208
0
        left = 0;
10209
68
    if (right < 0)
10210
0
        right = 0;
10211
10212
68
    if (left == 0 && right == 0)
10213
0
        return unicode_result_unchanged(self);
10214
10215
68
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10216
68
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10217
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10218
0
        return NULL;
10219
0
    }
10220
68
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10221
68
    maxchar = Py_MAX(maxchar, fill);
10222
68
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10223
68
    if (!u)
10224
0
        return NULL;
10225
10226
68
    kind = PyUnicode_KIND(u);
10227
68
    data = PyUnicode_DATA(u);
10228
68
    if (left)
10229
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10230
68
    if (right)
10231
68
        _PyUnicode_Fill(kind, data, fill,
10232
68
                        left + _PyUnicode_LENGTH(self), right);
10233
68
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10234
68
    assert(_PyUnicode_CheckConsistency(u, 1));
10235
68
    return u;
10236
68
}
10237
10238
/* Split `string` into lines by dispatching to the *_splitlines routine that
   matches its kind (with a dedicated ASCII variant for 1-byte strings).
   `keepends` is passed through to that routine unchanged.

   Returns whatever the selected routine returns, or NULL when
   ensure_unicode() rejects `string`. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    const Py_ssize_t length = PyUnicode_GET_LENGTH(string);
    PyObject *lines;

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(string)) {
            lines = asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        else {
            lines = ucs1lib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        lines = ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), length, keepends);
        break;
    case PyUnicode_4BYTE_KIND:
        lines = ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), length, keepends);
        break;
    default:
        Py_UNREACHABLE();
    }
    return lines;
}
10272
10273
static PyObject *
10274
split(PyObject *self,
10275
      PyObject *substring,
10276
      Py_ssize_t maxcount)
10277
22.1M
{
10278
22.1M
    int kind1, kind2;
10279
22.1M
    const void *buf1, *buf2;
10280
22.1M
    Py_ssize_t len1, len2;
10281
22.1M
    PyObject* out;
10282
22.1M
    len1 = PyUnicode_GET_LENGTH(self);
10283
22.1M
    kind1 = PyUnicode_KIND(self);
10284
10285
22.1M
    if (substring == NULL) {
10286
175k
        if (maxcount < 0) {
10287
151k
            maxcount = (len1 - 1) / 2 + 1;
10288
151k
        }
10289
175k
        switch (kind1) {
10290
110k
        case PyUnicode_1BYTE_KIND:
10291
110k
            if (PyUnicode_IS_ASCII(self))
10292
84.4k
                return asciilib_split_whitespace(
10293
84.4k
                    self,  PyUnicode_1BYTE_DATA(self),
10294
84.4k
                    len1, maxcount
10295
84.4k
                    );
10296
25.7k
            else
10297
25.7k
                return ucs1lib_split_whitespace(
10298
25.7k
                    self,  PyUnicode_1BYTE_DATA(self),
10299
25.7k
                    len1, maxcount
10300
25.7k
                    );
10301
55.0k
        case PyUnicode_2BYTE_KIND:
10302
55.0k
            return ucs2lib_split_whitespace(
10303
55.0k
                self,  PyUnicode_2BYTE_DATA(self),
10304
55.0k
                len1, maxcount
10305
55.0k
                );
10306
9.82k
        case PyUnicode_4BYTE_KIND:
10307
9.82k
            return ucs4lib_split_whitespace(
10308
9.82k
                self,  PyUnicode_4BYTE_DATA(self),
10309
9.82k
                len1, maxcount
10310
9.82k
                );
10311
0
        default:
10312
0
            Py_UNREACHABLE();
10313
175k
        }
10314
175k
    }
10315
10316
21.9M
    kind2 = PyUnicode_KIND(substring);
10317
21.9M
    len2 = PyUnicode_GET_LENGTH(substring);
10318
21.9M
    if (maxcount < 0) {
10319
        // if len2 == 0, it will raise ValueError.
10320
15.7M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10321
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10322
15.7M
        maxcount = maxcount < 0 ? len1 : maxcount;
10323
15.7M
    }
10324
21.9M
    if (kind1 < kind2 || len1 < len2) {
10325
1.84M
        out = PyList_New(1);
10326
1.84M
        if (out == NULL)
10327
0
            return NULL;
10328
1.84M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10329
1.84M
        return out;
10330
1.84M
    }
10331
20.1M
    buf1 = PyUnicode_DATA(self);
10332
20.1M
    buf2 = PyUnicode_DATA(substring);
10333
20.1M
    if (kind2 != kind1) {
10334
250k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10335
250k
        if (!buf2)
10336
0
            return NULL;
10337
250k
    }
10338
10339
20.1M
    switch (kind1) {
10340
19.8M
    case PyUnicode_1BYTE_KIND:
10341
19.8M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10342
18.6M
            out = asciilib_split(
10343
18.6M
                self,  buf1, len1, buf2, len2, maxcount);
10344
1.16M
        else
10345
1.16M
            out = ucs1lib_split(
10346
1.16M
                self,  buf1, len1, buf2, len2, maxcount);
10347
19.8M
        break;
10348
216k
    case PyUnicode_2BYTE_KIND:
10349
216k
        out = ucs2lib_split(
10350
216k
            self,  buf1, len1, buf2, len2, maxcount);
10351
216k
        break;
10352
34.5k
    case PyUnicode_4BYTE_KIND:
10353
34.5k
        out = ucs4lib_split(
10354
34.5k
            self,  buf1, len1, buf2, len2, maxcount);
10355
34.5k
        break;
10356
0
    default:
10357
0
        out = NULL;
10358
20.1M
    }
10359
20.1M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10360
20.1M
    if (kind2 != kind1)
10361
250k
        PyMem_Free((void *)buf2);
10362
20.1M
    return out;
10363
20.1M
}
10364
10365
static PyObject *
10366
rsplit(PyObject *self,
10367
       PyObject *substring,
10368
       Py_ssize_t maxcount)
10369
66
{
10370
66
    int kind1, kind2;
10371
66
    const void *buf1, *buf2;
10372
66
    Py_ssize_t len1, len2;
10373
66
    PyObject* out;
10374
10375
66
    len1 = PyUnicode_GET_LENGTH(self);
10376
66
    kind1 = PyUnicode_KIND(self);
10377
10378
66
    if (substring == NULL) {
10379
0
        if (maxcount < 0) {
10380
0
            maxcount = (len1 - 1) / 2 + 1;
10381
0
        }
10382
0
        switch (kind1) {
10383
0
        case PyUnicode_1BYTE_KIND:
10384
0
            if (PyUnicode_IS_ASCII(self))
10385
0
                return asciilib_rsplit_whitespace(
10386
0
                    self,  PyUnicode_1BYTE_DATA(self),
10387
0
                    len1, maxcount
10388
0
                    );
10389
0
            else
10390
0
                return ucs1lib_rsplit_whitespace(
10391
0
                    self,  PyUnicode_1BYTE_DATA(self),
10392
0
                    len1, maxcount
10393
0
                    );
10394
0
        case PyUnicode_2BYTE_KIND:
10395
0
            return ucs2lib_rsplit_whitespace(
10396
0
                self,  PyUnicode_2BYTE_DATA(self),
10397
0
                len1, maxcount
10398
0
                );
10399
0
        case PyUnicode_4BYTE_KIND:
10400
0
            return ucs4lib_rsplit_whitespace(
10401
0
                self,  PyUnicode_4BYTE_DATA(self),
10402
0
                len1, maxcount
10403
0
                );
10404
0
        default:
10405
0
            Py_UNREACHABLE();
10406
0
        }
10407
0
    }
10408
66
    kind2 = PyUnicode_KIND(substring);
10409
66
    len2 = PyUnicode_GET_LENGTH(substring);
10410
66
    if (maxcount < 0) {
10411
        // if len2 == 0, it will raise ValueError.
10412
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10413
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10414
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10415
0
    }
10416
66
    if (kind1 < kind2 || len1 < len2) {
10417
0
        out = PyList_New(1);
10418
0
        if (out == NULL)
10419
0
            return NULL;
10420
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10421
0
        return out;
10422
0
    }
10423
66
    buf1 = PyUnicode_DATA(self);
10424
66
    buf2 = PyUnicode_DATA(substring);
10425
66
    if (kind2 != kind1) {
10426
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10427
0
        if (!buf2)
10428
0
            return NULL;
10429
0
    }
10430
10431
66
    switch (kind1) {
10432
66
    case PyUnicode_1BYTE_KIND:
10433
66
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10434
66
            out = asciilib_rsplit(
10435
66
                self,  buf1, len1, buf2, len2, maxcount);
10436
0
        else
10437
0
            out = ucs1lib_rsplit(
10438
0
                self,  buf1, len1, buf2, len2, maxcount);
10439
66
        break;
10440
0
    case PyUnicode_2BYTE_KIND:
10441
0
        out = ucs2lib_rsplit(
10442
0
            self,  buf1, len1, buf2, len2, maxcount);
10443
0
        break;
10444
0
    case PyUnicode_4BYTE_KIND:
10445
0
        out = ucs4lib_rsplit(
10446
0
            self,  buf1, len1, buf2, len2, maxcount);
10447
0
        break;
10448
0
    default:
10449
0
        out = NULL;
10450
66
    }
10451
66
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10452
66
    if (kind2 != kind1)
10453
0
        PyMem_Free((void *)buf2);
10454
66
    return out;
10455
66
}
10456
10457
static Py_ssize_t
10458
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10459
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10460
23.3M
{
10461
23.3M
    switch (kind) {
10462
8.64M
    case PyUnicode_1BYTE_KIND:
10463
8.64M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10464
4.31M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10465
4.33M
        else
10466
4.33M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10467
6.77M
    case PyUnicode_2BYTE_KIND:
10468
6.77M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10469
7.92M
    case PyUnicode_4BYTE_KIND:
10470
7.92M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10471
23.3M
    }
10472
23.3M
    Py_UNREACHABLE();
10473
23.3M
}
10474
10475
static Py_ssize_t
10476
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10477
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10478
11.6M
{
10479
11.6M
    switch (kind) {
10480
10.8M
    case PyUnicode_1BYTE_KIND:
10481
10.8M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10482
715k
    case PyUnicode_2BYTE_KIND:
10483
715k
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10484
84.2k
    case PyUnicode_4BYTE_KIND:
10485
84.2k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10486
11.6M
    }
10487
11.6M
    Py_UNREACHABLE();
10488
11.6M
}
10489
10490
static void
10491
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10492
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10493
61.0k
{
10494
61.0k
    int kind = PyUnicode_KIND(u);
10495
61.0k
    void *data = PyUnicode_DATA(u);
10496
61.0k
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10497
61.0k
    if (kind == PyUnicode_1BYTE_KIND) {
10498
36.1k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10499
36.1k
                                      (Py_UCS1 *)data + len,
10500
36.1k
                                      u1, u2, maxcount);
10501
36.1k
    }
10502
24.8k
    else if (kind == PyUnicode_2BYTE_KIND) {
10503
20.2k
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10504
20.2k
                                      (Py_UCS2 *)data + len,
10505
20.2k
                                      u1, u2, maxcount);
10506
20.2k
    }
10507
4.60k
    else {
10508
4.60k
        assert(kind == PyUnicode_4BYTE_KIND);
10509
4.60k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10510
4.60k
                                      (Py_UCS4 *)data + len,
10511
4.60k
                                      u1, u2, maxcount);
10512
4.60k
    }
10513
61.0k
}
10514
10515
static PyObject *
10516
replace(PyObject *self, PyObject *str1,
10517
        PyObject *str2, Py_ssize_t maxcount)
10518
20.8M
{
10519
20.8M
    PyObject *u;
10520
20.8M
    const char *sbuf = PyUnicode_DATA(self);
10521
20.8M
    const void *buf1 = PyUnicode_DATA(str1);
10522
20.8M
    const void *buf2 = PyUnicode_DATA(str2);
10523
20.8M
    int srelease = 0, release1 = 0, release2 = 0;
10524
20.8M
    int skind = PyUnicode_KIND(self);
10525
20.8M
    int kind1 = PyUnicode_KIND(str1);
10526
20.8M
    int kind2 = PyUnicode_KIND(str2);
10527
20.8M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10528
20.8M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10529
20.8M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10530
20.8M
    int mayshrink;
10531
20.8M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10532
10533
20.8M
    if (slen < len1)
10534
8.64M
        goto nothing;
10535
10536
12.2M
    if (maxcount < 0)
10537
12.2M
        maxcount = PY_SSIZE_T_MAX;
10538
0
    else if (maxcount == 0)
10539
0
        goto nothing;
10540
10541
12.2M
    if (str1 == str2)
10542
0
        goto nothing;
10543
10544
12.2M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10545
12.2M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10546
12.2M
    if (maxchar < maxchar_str1)
10547
        /* substring too wide to be present */
10548
0
        goto nothing;
10549
12.2M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10550
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10551
       result string. */
10552
12.2M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10553
12.2M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10554
10555
12.2M
    if (len1 == len2) {
10556
        /* same length */
10557
503k
        if (len1 == 0)
10558
0
            goto nothing;
10559
503k
        if (len1 == 1) {
10560
            /* replace characters */
10561
495k
            Py_UCS4 u1, u2;
10562
495k
            Py_ssize_t pos;
10563
10564
495k
            u1 = PyUnicode_READ(kind1, buf1, 0);
10565
495k
            pos = findchar(sbuf, skind, slen, u1, 1);
10566
495k
            if (pos < 0)
10567
433k
                goto nothing;
10568
61.0k
            u2 = PyUnicode_READ(kind2, buf2, 0);
10569
61.0k
            u = PyUnicode_New(slen, maxchar);
10570
61.0k
            if (!u)
10571
0
                goto error;
10572
10573
61.0k
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10574
61.0k
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10575
61.0k
        }
10576
8.17k
        else {
10577
8.17k
            int rkind = skind;
10578
8.17k
            char *res;
10579
8.17k
            Py_ssize_t i;
10580
10581
8.17k
            if (kind1 < rkind) {
10582
                /* widen substring */
10583
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10584
0
                if (!buf1) goto error;
10585
0
                release1 = 1;
10586
0
            }
10587
8.17k
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10588
8.17k
            if (i < 0)
10589
8.17k
                goto nothing;
10590
0
            if (rkind > kind2) {
10591
                /* widen replacement */
10592
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10593
0
                if (!buf2) goto error;
10594
0
                release2 = 1;
10595
0
            }
10596
0
            else if (rkind < kind2) {
10597
                /* widen self and buf1 */
10598
0
                rkind = kind2;
10599
0
                if (release1) {
10600
0
                    assert(buf1 != PyUnicode_DATA(str1));
10601
0
                    PyMem_Free((void *)buf1);
10602
0
                    buf1 = PyUnicode_DATA(str1);
10603
0
                    release1 = 0;
10604
0
                }
10605
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10606
0
                if (!sbuf) goto error;
10607
0
                srelease = 1;
10608
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10609
0
                if (!buf1) goto error;
10610
0
                release1 = 1;
10611
0
            }
10612
0
            u = PyUnicode_New(slen, maxchar);
10613
0
            if (!u)
10614
0
                goto error;
10615
0
            assert(PyUnicode_KIND(u) == rkind);
10616
0
            res = PyUnicode_DATA(u);
10617
10618
0
            memcpy(res, sbuf, rkind * slen);
10619
            /* change everything in-place, starting with this one */
10620
0
            memcpy(res + rkind * i,
10621
0
                   buf2,
10622
0
                   rkind * len2);
10623
0
            i += len1;
10624
10625
0
            while ( --maxcount > 0) {
10626
0
                i = anylib_find(rkind, self,
10627
0
                                sbuf+rkind*i, slen-i,
10628
0
                                str1, buf1, len1, i);
10629
0
                if (i == -1)
10630
0
                    break;
10631
0
                memcpy(res + rkind * i,
10632
0
                       buf2,
10633
0
                       rkind * len2);
10634
0
                i += len1;
10635
0
            }
10636
0
        }
10637
503k
    }
10638
11.6M
    else {
10639
11.6M
        Py_ssize_t n, i, j, ires;
10640
11.6M
        Py_ssize_t new_size;
10641
11.6M
        int rkind = skind;
10642
11.6M
        char *res;
10643
10644
11.6M
        if (kind1 < rkind) {
10645
            /* widen substring */
10646
799k
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10647
799k
            if (!buf1) goto error;
10648
799k
            release1 = 1;
10649
799k
        }
10650
11.6M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10651
11.6M
        if (n == 0)
10652
10.3M
            goto nothing;
10653
1.36M
        if (kind2 < rkind) {
10654
            /* widen replacement */
10655
42.8k
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10656
42.8k
            if (!buf2) goto error;
10657
42.8k
            release2 = 1;
10658
42.8k
        }
10659
1.32M
        else if (kind2 > rkind) {
10660
            /* widen self and buf1 */
10661
0
            rkind = kind2;
10662
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10663
0
            if (!sbuf) goto error;
10664
0
            srelease = 1;
10665
0
            if (release1) {
10666
0
                assert(buf1 != PyUnicode_DATA(str1));
10667
0
                PyMem_Free((void *)buf1);
10668
0
                buf1 = PyUnicode_DATA(str1);
10669
0
                release1 = 0;
10670
0
            }
10671
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10672
0
            if (!buf1) goto error;
10673
0
            release1 = 1;
10674
0
        }
10675
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10676
           PyUnicode_GET_LENGTH(str1)); */
10677
1.36M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10678
0
                PyErr_SetString(PyExc_OverflowError,
10679
0
                                "replace string is too long");
10680
0
                goto error;
10681
0
        }
10682
1.36M
        new_size = slen + n * (len2 - len1);
10683
1.36M
        if (new_size == 0) {
10684
0
            u = _PyUnicode_GetEmpty();
10685
0
            goto done;
10686
0
        }
10687
1.36M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10688
0
            PyErr_SetString(PyExc_OverflowError,
10689
0
                            "replace string is too long");
10690
0
            goto error;
10691
0
        }
10692
1.36M
        u = PyUnicode_New(new_size, maxchar);
10693
1.36M
        if (!u)
10694
0
            goto error;
10695
1.36M
        assert(PyUnicode_KIND(u) == rkind);
10696
1.36M
        res = PyUnicode_DATA(u);
10697
1.36M
        ires = i = 0;
10698
1.36M
        if (len1 > 0) {
10699
24.7M
            while (n-- > 0) {
10700
                /* look for next match */
10701
23.3M
                j = anylib_find(rkind, self,
10702
23.3M
                                sbuf + rkind * i, slen-i,
10703
23.3M
                                str1, buf1, len1, i);
10704
23.3M
                if (j == -1)
10705
0
                    break;
10706
23.3M
                else if (j > i) {
10707
                    /* copy unchanged part [i:j] */
10708
4.80M
                    memcpy(res + rkind * ires,
10709
4.80M
                           sbuf + rkind * i,
10710
4.80M
                           rkind * (j-i));
10711
4.80M
                    ires += j - i;
10712
4.80M
                }
10713
                /* copy substitution string */
10714
23.3M
                if (len2 > 0) {
10715
23.3M
                    memcpy(res + rkind * ires,
10716
23.3M
                           buf2,
10717
23.3M
                           rkind * len2);
10718
23.3M
                    ires += len2;
10719
23.3M
                }
10720
23.3M
                i = j + len1;
10721
23.3M
            }
10722
1.36M
            if (i < slen)
10723
                /* copy tail [i:] */
10724
1.36M
                memcpy(res + rkind * ires,
10725
1.36M
                       sbuf + rkind * i,
10726
1.36M
                       rkind * (slen-i));
10727
1.36M
        }
10728
0
        else {
10729
            /* interleave */
10730
0
            while (n > 0) {
10731
0
                memcpy(res + rkind * ires,
10732
0
                       buf2,
10733
0
                       rkind * len2);
10734
0
                ires += len2;
10735
0
                if (--n <= 0)
10736
0
                    break;
10737
0
                memcpy(res + rkind * ires,
10738
0
                       sbuf + rkind * i,
10739
0
                       rkind);
10740
0
                ires++;
10741
0
                i++;
10742
0
            }
10743
0
            memcpy(res + rkind * ires,
10744
0
                   sbuf + rkind * i,
10745
0
                   rkind * (slen-i));
10746
0
        }
10747
1.36M
    }
10748
10749
1.42M
    if (mayshrink) {
10750
0
        unicode_adjust_maxchar(&u);
10751
0
        if (u == NULL)
10752
0
            goto error;
10753
0
    }
10754
10755
1.42M
  done:
10756
1.42M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10757
1.42M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10758
1.42M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10759
1.42M
    if (srelease)
10760
0
        PyMem_Free((void *)sbuf);
10761
1.42M
    if (release1)
10762
42.8k
        PyMem_Free((void *)buf1);
10763
1.42M
    if (release2)
10764
42.8k
        PyMem_Free((void *)buf2);
10765
1.42M
    assert(_PyUnicode_CheckConsistency(u, 1));
10766
1.42M
    return u;
10767
10768
19.4M
  nothing:
10769
    /* nothing to replace; return original string (when possible) */
10770
19.4M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10771
19.4M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10772
19.4M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10773
19.4M
    if (srelease)
10774
0
        PyMem_Free((void *)sbuf);
10775
19.4M
    if (release1)
10776
756k
        PyMem_Free((void *)buf1);
10777
19.4M
    if (release2)
10778
0
        PyMem_Free((void *)buf2);
10779
19.4M
    return unicode_result_unchanged(self);
10780
10781
0
  error:
10782
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10783
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10784
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10785
0
    if (srelease)
10786
0
        PyMem_Free((void *)sbuf);
10787
0
    if (release1)
10788
0
        PyMem_Free((void *)buf1);
10789
0
    if (release2)
10790
0
        PyMem_Free((void *)buf2);
10791
0
    return NULL;
10792
1.42M
}
10793
10794
/* --- Unicode Object Methods --------------------------------------------- */
10795
10796
/*[clinic input]
10797
str.title as unicode_title
10798
10799
Return a version of the string where each word is titlecased.
10800
10801
More specifically, words start with uppercased characters and all
10802
remaining cased characters have lower case.
10803
[clinic start generated code]*/
10804
10805
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=2a07e2c7df94627a]*/
{
    /* Title-casing is handled by the shared case_operation() driver,
       parameterised with the do_title mapper. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10811
10812
/*[clinic input]
10813
str.capitalize as unicode_capitalize
10814
10815
Return a capitalized version of the string.
10816
10817
More specifically, make the first character have upper case and the
10818
rest lower case.
10819
[clinic start generated code]*/
10820
10821
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=e50e50ed45a654cf]*/
{
    /* Only a non-empty string needs the case_operation() driver; an
       empty one is handed to unicode_result_unchanged(). */
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
10829
10830
/*[clinic input]
10831
str.casefold as unicode_casefold
10832
10833
Return a version of the string suitable for caseless comparisons.
10834
[clinic start generated code]*/
10835
10836
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* Non-ASCII text goes through the general case_operation() driver. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_casefold);
    }
    /* ASCII fast path.  NOTE(review): the flag value 1 presumably
       selects lowercasing -- confirm against ascii_upper_or_lower(). */
    return ascii_upper_or_lower(self, 1);
}
10844
10845
10846
/* Argument converter. Accepts a single Unicode character. */
10847
10848
static int
10849
convert_uc(PyObject *obj, void *addr)
10850
130
{
10851
130
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10852
10853
130
    if (!PyUnicode_Check(obj)) {
10854
0
        PyErr_Format(PyExc_TypeError,
10855
0
                     "The fill character must be a unicode character, "
10856
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10857
0
        return 0;
10858
0
    }
10859
130
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10860
0
        PyErr_SetString(PyExc_TypeError,
10861
0
                        "The fill character must be exactly one character long");
10862
0
        return 0;
10863
0
    }
10864
130
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10865
130
    return 1;
10866
130
}
10867
10868
/*[clinic input]
10869
str.center as unicode_center
10870
10871
    width: Py_ssize_t
10872
    fillchar: Py_UCS4 = ' '
10873
    /
10874
10875
Return a centered string of length width.
10876
10877
Padding is done using the specified fill character (default is
10878
a space).
10879
[clinic start generated code]*/
10880
10881
static PyObject *
10882
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10883
/*[clinic end generated code: output=420c8859effc7c0c input=df91017dfd186a78]*/
10884
0
{
10885
0
    Py_ssize_t marg, left;
10886
10887
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10888
0
        return unicode_result_unchanged(self);
10889
10890
0
    marg = width - PyUnicode_GET_LENGTH(self);
10891
0
    left = marg / 2 + (marg & width & 1);
10892
10893
0
    return pad(self, left, marg - left, fillchar);
10894
0
}
10895
10896
/* This function assumes that str1 and str2 are readied by the caller. */
10897
10898
static int
10899
unicode_compare(PyObject *str1, PyObject *str2)
10900
30.8M
{
10901
30.8M
#define COMPARE(TYPE1, TYPE2) \
10902
30.8M
    do { \
10903
28.0M
        TYPE1* p1 = (TYPE1 *)data1; \
10904
28.0M
        TYPE2* p2 = (TYPE2 *)data2; \
10905
28.0M
        TYPE1* end = p1 + len; \
10906
28.0M
        Py_UCS4 c1, c2; \
10907
28.0M
        for (; p1 != end; p1++, p2++) { \
10908
28.0M
            c1 = *p1; \
10909
28.0M
            c2 = *p2; \
10910
28.0M
            if (c1 != c2) \
10911
28.0M
                return (c1 < c2) ? -1 : 1; \
10912
28.0M
        } \
10913
28.0M
    } \
10914
28.0M
    while (0)
10915
10916
30.8M
    int kind1, kind2;
10917
30.8M
    const void *data1, *data2;
10918
30.8M
    Py_ssize_t len1, len2, len;
10919
10920
30.8M
    kind1 = PyUnicode_KIND(str1);
10921
30.8M
    kind2 = PyUnicode_KIND(str2);
10922
30.8M
    data1 = PyUnicode_DATA(str1);
10923
30.8M
    data2 = PyUnicode_DATA(str2);
10924
30.8M
    len1 = PyUnicode_GET_LENGTH(str1);
10925
30.8M
    len2 = PyUnicode_GET_LENGTH(str2);
10926
30.8M
    len = Py_MIN(len1, len2);
10927
10928
30.8M
    switch(kind1) {
10929
4.40M
    case PyUnicode_1BYTE_KIND:
10930
4.40M
    {
10931
4.40M
        switch(kind2) {
10932
440k
        case PyUnicode_1BYTE_KIND:
10933
440k
        {
10934
440k
            int cmp = memcmp(data1, data2, len);
10935
            /* normalize result of memcmp() into the range [-1; 1] */
10936
440k
            if (cmp < 0)
10937
376k
                return -1;
10938
64.4k
            if (cmp > 0)
10939
57.8k
                return 1;
10940
6.61k
            break;
10941
64.4k
        }
10942
3.46M
        case PyUnicode_2BYTE_KIND:
10943
3.46M
            COMPARE(Py_UCS1, Py_UCS2);
10944
0
            break;
10945
496k
        case PyUnicode_4BYTE_KIND:
10946
496k
            COMPARE(Py_UCS1, Py_UCS4);
10947
0
            break;
10948
0
        default:
10949
0
            Py_UNREACHABLE();
10950
4.40M
        }
10951
6.61k
        break;
10952
4.40M
    }
10953
23.7M
    case PyUnicode_2BYTE_KIND:
10954
23.7M
    {
10955
23.7M
        switch(kind2) {
10956
86.8k
        case PyUnicode_1BYTE_KIND:
10957
86.8k
            COMPARE(Py_UCS2, Py_UCS1);
10958
0
            break;
10959
23.1M
        case PyUnicode_2BYTE_KIND:
10960
23.1M
        {
10961
23.1M
            COMPARE(Py_UCS2, Py_UCS2);
10962
0
            break;
10963
23.1M
        }
10964
484k
        case PyUnicode_4BYTE_KIND:
10965
484k
            COMPARE(Py_UCS2, Py_UCS4);
10966
0
            break;
10967
0
        default:
10968
0
            Py_UNREACHABLE();
10969
23.7M
        }
10970
0
        break;
10971
23.7M
    }
10972
2.65M
    case PyUnicode_4BYTE_KIND:
10973
2.65M
    {
10974
2.65M
        switch(kind2) {
10975
8.34k
        case PyUnicode_1BYTE_KIND:
10976
8.34k
            COMPARE(Py_UCS4, Py_UCS1);
10977
0
            break;
10978
352k
        case PyUnicode_2BYTE_KIND:
10979
352k
            COMPARE(Py_UCS4, Py_UCS2);
10980
0
            break;
10981
2.29M
        case PyUnicode_4BYTE_KIND:
10982
2.29M
        {
10983
2.29M
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10984
2.29M
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10985
            /* normalize result of wmemcmp() into the range [-1; 1] */
10986
2.29M
            if (cmp < 0)
10987
1.13M
                return -1;
10988
1.16M
            if (cmp > 0)
10989
1.16M
                return 1;
10990
#else
10991
            COMPARE(Py_UCS4, Py_UCS4);
10992
#endif
10993
0
            break;
10994
1.16M
        }
10995
0
        default:
10996
0
            Py_UNREACHABLE();
10997
2.65M
        }
10998
0
        break;
10999
2.65M
    }
11000
0
    default:
11001
0
        Py_UNREACHABLE();
11002
30.8M
    }
11003
11004
6.61k
    if (len1 == len2)
11005
6.57k
        return 0;
11006
37
    if (len1 < len2)
11007
14
        return -1;
11008
23
    else
11009
23
        return 1;
11010
11011
37
#undef COMPARE
11012
37
}
11013
11014
11015
int
11016
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11017
611M
{
11018
611M
    assert(PyUnicode_Check(str1));
11019
611M
    assert(PyUnicode_Check(str2));
11020
611M
    if (str1 == str2) {
11021
80.7M
        return 1;
11022
80.7M
    }
11023
530M
    return unicode_eq(str1, str2);
11024
611M
}
11025
11026
11027
int
11028
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11029
0
{
11030
0
    if (!PyUnicode_Check(str1)) {
11031
0
        PyErr_Format(PyExc_TypeError,
11032
0
                     "first argument must be str, not %T", str1);
11033
0
        return -1;
11034
0
    }
11035
0
    if (!PyUnicode_Check(str2)) {
11036
0
        PyErr_Format(PyExc_TypeError,
11037
0
                     "second argument must be str, not %T", str2);
11038
0
        return -1;
11039
0
    }
11040
11041
0
    return _PyUnicode_Equal(str1, str2);
11042
0
}
11043
11044
11045
int
11046
PyUnicode_Compare(PyObject *left, PyObject *right)
11047
277k
{
11048
277k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11049
        /* a string is equal to itself */
11050
277k
        if (left == right)
11051
0
            return 0;
11052
11053
277k
        return unicode_compare(left, right);
11054
277k
    }
11055
0
    PyErr_Format(PyExc_TypeError,
11056
0
                 "Can't compare %.100s and %.100s",
11057
0
                 Py_TYPE(left)->tp_name,
11058
0
                 Py_TYPE(right)->tp_name);
11059
0
    return -1;
11060
277k
}
11061
11062
int
11063
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11064
11.0M
{
11065
11.0M
    Py_ssize_t i;
11066
11.0M
    int kind;
11067
11.0M
    Py_UCS4 chr;
11068
11069
11.0M
    assert(_PyUnicode_CHECK(uni));
11070
11.0M
    kind = PyUnicode_KIND(uni);
11071
11.0M
    if (kind == PyUnicode_1BYTE_KIND) {
11072
11.0M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11073
11.0M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11074
11.0M
        size_t len, len2 = strlen(str);
11075
11.0M
        int cmp;
11076
11077
11.0M
        len = Py_MIN(len1, len2);
11078
11.0M
        cmp = memcmp(data, str, len);
11079
11.0M
        if (cmp != 0) {
11080
6.84M
            if (cmp < 0)
11081
53.8k
                return -1;
11082
6.78M
            else
11083
6.78M
                return 1;
11084
6.84M
        }
11085
4.16M
        if (len1 > len2)
11086
205
            return 1; /* uni is longer */
11087
4.16M
        if (len1 < len2)
11088
943
            return -1; /* str is longer */
11089
4.16M
        return 0;
11090
4.16M
    }
11091
1.27k
    else {
11092
1.27k
        const void *data = PyUnicode_DATA(uni);
11093
        /* Compare Unicode string and source character set string */
11094
1.83k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11095
1.76k
            if (chr != (unsigned char)str[i])
11096
1.20k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11097
        /* This check keeps Python strings that end in '\0' from comparing equal
11098
         to C strings identical up to that point. */
11099
71
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11100
71
            return 1; /* uni is longer */
11101
0
        if (str[i])
11102
0
            return -1; /* str is longer */
11103
0
        return 0;
11104
0
    }
11105
11.0M
}
11106
11107
int
11108
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11109
24
{
11110
24
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11111
24
}
11112
11113
int
11114
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11115
24
{
11116
24
    assert(_PyUnicode_CHECK(unicode));
11117
24
    assert(str);
11118
11119
24
    if (PyUnicode_IS_ASCII(unicode)) {
11120
24
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11121
24
        return size == len &&
11122
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11123
24
    }
11124
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11125
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11126
0
        return size == len &&
11127
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11128
0
    }
11129
11130
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11131
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11132
0
        return 0;
11133
0
    }
11134
0
    const unsigned char *s = (const unsigned char *)str;
11135
0
    const unsigned char *ends = s + (size_t)size;
11136
0
    int kind = PyUnicode_KIND(unicode);
11137
0
    const void *data = PyUnicode_DATA(unicode);
11138
    /* Compare Unicode string and UTF-8 string */
11139
0
    for (Py_ssize_t i = 0; i < len; i++) {
11140
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11141
0
        if (ch < 0x80) {
11142
0
            if (ends == s || s[0] != ch) {
11143
0
                return 0;
11144
0
            }
11145
0
            s += 1;
11146
0
        }
11147
0
        else if (ch < 0x800) {
11148
0
            if ((ends - s) < 2 ||
11149
0
                s[0] != (0xc0 | (ch >> 6)) ||
11150
0
                s[1] != (0x80 | (ch & 0x3f)))
11151
0
            {
11152
0
                return 0;
11153
0
            }
11154
0
            s += 2;
11155
0
        }
11156
0
        else if (ch < 0x10000) {
11157
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11158
0
                (ends - s) < 3 ||
11159
0
                s[0] != (0xe0 | (ch >> 12)) ||
11160
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11161
0
                s[2] != (0x80 | (ch & 0x3f)))
11162
0
            {
11163
0
                return 0;
11164
0
            }
11165
0
            s += 3;
11166
0
        }
11167
0
        else {
11168
0
            assert(ch <= MAX_UNICODE);
11169
0
            if ((ends - s) < 4 ||
11170
0
                s[0] != (0xf0 | (ch >> 18)) ||
11171
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11172
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11173
0
                s[3] != (0x80 | (ch & 0x3f)))
11174
0
            {
11175
0
                return 0;
11176
0
            }
11177
0
            s += 4;
11178
0
        }
11179
0
    }
11180
0
    return s == ends;
11181
0
}
11182
11183
int
11184
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11185
41.3M
{
11186
41.3M
    size_t len;
11187
41.3M
    assert(_PyUnicode_CHECK(unicode));
11188
41.3M
    assert(str);
11189
#ifndef NDEBUG
11190
    for (const char *p = str; *p; p++) {
11191
        assert((unsigned char)*p < 128);
11192
    }
11193
#endif
11194
41.3M
    if (!PyUnicode_IS_ASCII(unicode))
11195
143k
        return 0;
11196
41.1M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11197
41.1M
    return strlen(str) == len &&
11198
645k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11199
41.3M
}
11200
11201
/* Rich comparison of two objects as strings.

   Returns NotImplemented when either operand is not a str.  Otherwise
   returns True or False for `op`.  When both operands are the same object
   and `op` is not one of the six comparison codes, PyErr_BadArgument() is
   called and NULL is returned. */
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    if (!(PyUnicode_Check(left) && PyUnicode_Check(right))) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* a string is equal to itself */
        if (op == Py_EQ || op == Py_LE || op == Py_GE) {
            Py_RETURN_TRUE;
        }
        if (op == Py_NE || op == Py_LT || op == Py_GT) {
            Py_RETURN_FALSE;
        }
        PyErr_BadArgument();
        return NULL;
    }

    if (op == Py_EQ || op == Py_NE) {
        /* Equality only: flip the answer for Py_NE. */
        int equal = unicode_eq(left, right);
        equal ^= (op == Py_NE);
        return PyBool_FromLong(equal);
    }

    /* Ordering comparison: compare the three-way result against 0. */
    int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11235
11236
int
11237
PyUnicode_Contains(PyObject *str, PyObject *substr)
11238
227M
{
11239
227M
    int kind1, kind2;
11240
227M
    const void *buf1, *buf2;
11241
227M
    Py_ssize_t len1, len2;
11242
227M
    int result;
11243
11244
227M
    if (!PyUnicode_Check(substr)) {
11245
0
        PyErr_Format(PyExc_TypeError,
11246
0
                     "'in <string>' requires string as left operand, not %.100s",
11247
0
                     Py_TYPE(substr)->tp_name);
11248
0
        return -1;
11249
0
    }
11250
227M
    if (ensure_unicode(str) < 0)
11251
0
        return -1;
11252
11253
227M
    kind1 = PyUnicode_KIND(str);
11254
227M
    kind2 = PyUnicode_KIND(substr);
11255
227M
    if (kind1 < kind2)
11256
15.4M
        return 0;
11257
211M
    len1 = PyUnicode_GET_LENGTH(str);
11258
211M
    len2 = PyUnicode_GET_LENGTH(substr);
11259
211M
    if (len1 < len2)
11260
1.08M
        return 0;
11261
210M
    buf1 = PyUnicode_DATA(str);
11262
210M
    buf2 = PyUnicode_DATA(substr);
11263
210M
    if (len2 == 1) {
11264
188M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11265
188M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11266
188M
        return result;
11267
188M
    }
11268
22.0M
    if (kind2 != kind1) {
11269
18.2k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11270
18.2k
        if (!buf2)
11271
0
            return -1;
11272
18.2k
    }
11273
11274
22.0M
    switch (kind1) {
11275
21.9M
    case PyUnicode_1BYTE_KIND:
11276
21.9M
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11277
21.9M
        break;
11278
13.7k
    case PyUnicode_2BYTE_KIND:
11279
13.7k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11280
13.7k
        break;
11281
4.53k
    case PyUnicode_4BYTE_KIND:
11282
4.53k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11283
4.53k
        break;
11284
0
    default:
11285
0
        Py_UNREACHABLE();
11286
22.0M
    }
11287
11288
22.0M
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11289
22.0M
    if (kind2 != kind1)
11290
18.2k
        PyMem_Free((void *)buf2);
11291
11292
22.0M
    return result;
11293
22.0M
}
11294
11295
/* Concat to string or Unicode object giving a new Unicode object. */
11296
11297
/* Build a new str holding the characters of `left` followed by `right`.

   Returns NULL on failure: ensure_unicode(left) failing, `right` not being
   a str (TypeError), the combined length exceeding PY_SSIZE_T_MAX
   (OverflowError), or PyUnicode_New() failing.  When either operand is the
   empty-string singleton the result comes from PyUnicode_FromObject() on
   the other operand. */
PyObject *
PyUnicode_Concat(PyObject *left, PyObject *right)
{
    if (ensure_unicode(left) < 0) {
        return NULL;
    }
    if (!PyUnicode_Check(right)) {
        PyErr_Format(PyExc_TypeError,
            "can only concatenate str (not \"%.200s\") to str",
            Py_TYPE(right)->tp_name);
        return NULL;
    }

    /* Shortcuts */
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
    if (left == empty) {
        return PyUnicode_FromObject(right);
    }
    if (right == empty) {
        return PyUnicode_FromObject(left);
    }

    const Py_ssize_t left_len = PyUnicode_GET_LENGTH(left);
    const Py_ssize_t right_len = PyUnicode_GET_LENGTH(right);
    /* Checked before adding so the sum cannot overflow. */
    if (left_len > PY_SSIZE_T_MAX - right_len) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        return NULL;
    }
    const Py_ssize_t total_len = left_len + right_len;

    /* The result must be able to hold the widest character of either side. */
    const Py_UCS4 left_max = PyUnicode_MAX_CHAR_VALUE(left);
    const Py_UCS4 right_max = PyUnicode_MAX_CHAR_VALUE(right);
    const Py_UCS4 widest = Py_MAX(left_max, right_max);

    /* Concat the two Unicode strings */
    PyObject *joined = PyUnicode_New(total_len, widest);
    if (joined == NULL) {
        return NULL;
    }
    _PyUnicode_FastCopyCharacters(joined, 0, left, 0, left_len);
    _PyUnicode_FastCopyCharacters(joined, left_len, right, 0, right_len);
    assert(_PyUnicode_CheckConsistency(joined, 1));
    return joined;
}
11345
11346
void
11347
PyUnicode_Append(PyObject **p_left, PyObject *right)
11348
5.58M
{
11349
5.58M
    PyObject *left, *res;
11350
5.58M
    Py_UCS4 maxchar, maxchar2;
11351
5.58M
    Py_ssize_t left_len, right_len, new_len;
11352
11353
5.58M
    if (p_left == NULL) {
11354
0
        if (!PyErr_Occurred())
11355
0
            PyErr_BadInternalCall();
11356
0
        return;
11357
0
    }
11358
5.58M
    left = *p_left;
11359
5.58M
    if (right == NULL || left == NULL
11360
5.58M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11361
0
        if (!PyErr_Occurred())
11362
0
            PyErr_BadInternalCall();
11363
0
        goto error;
11364
0
    }
11365
11366
    /* Shortcuts */
11367
5.58M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11368
5.58M
    if (left == empty) {
11369
426k
        Py_DECREF(left);
11370
426k
        *p_left = Py_NewRef(right);
11371
426k
        return;
11372
426k
    }
11373
5.15M
    if (right == empty) {
11374
12.7k
        return;
11375
12.7k
    }
11376
11377
5.14M
    left_len = PyUnicode_GET_LENGTH(left);
11378
5.14M
    right_len = PyUnicode_GET_LENGTH(right);
11379
5.14M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11380
0
        PyErr_SetString(PyExc_OverflowError,
11381
0
                        "strings are too large to concat");
11382
0
        goto error;
11383
0
    }
11384
5.14M
    new_len = left_len + right_len;
11385
11386
5.14M
    if (_PyUnicode_IsModifiable(left)
11387
5.14M
        && PyUnicode_CheckExact(right)
11388
5.14M
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11389
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11390
           to change the structure size, but characters are stored just after
11391
           the structure, and so it requires to move all characters which is
11392
           not so different than duplicating the string. */
11393
1.94M
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11394
1.94M
    {
11395
        /* append inplace */
11396
1.94M
        if (unicode_resize(p_left, new_len) != 0)
11397
0
            goto error;
11398
11399
        /* copy 'right' into the newly allocated area of 'left' */
11400
1.94M
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11401
1.94M
    }
11402
3.20M
    else {
11403
3.20M
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11404
3.20M
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11405
3.20M
        maxchar = Py_MAX(maxchar, maxchar2);
11406
11407
        /* Concat the two Unicode strings */
11408
3.20M
        res = PyUnicode_New(new_len, maxchar);
11409
3.20M
        if (res == NULL)
11410
0
            goto error;
11411
3.20M
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11412
3.20M
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11413
3.20M
        Py_DECREF(left);
11414
3.20M
        *p_left = res;
11415
3.20M
    }
11416
5.14M
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11417
5.14M
    return;
11418
11419
0
error:
11420
0
    Py_CLEAR(*p_left);
11421
0
}
11422
11423
void
11424
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11425
8
{
11426
8
    PyUnicode_Append(pleft, right);
11427
8
    Py_XDECREF(right);
11428
8
}
11429
11430
/*[clinic input]
11431
@permit_long_summary
11432
@text_signature "($self, sub[, start[, end]], /)"
11433
str.count as unicode_count -> Py_ssize_t
11434
11435
    self as str: self
11436
    sub as substr: unicode
11437
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11438
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11439
    /
11440
11441
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11442
11443
Optional arguments start and end are interpreted as in slice
11444
notation.
11445
[clinic start generated code]*/
11446
11447
static Py_ssize_t
11448
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11449
                   Py_ssize_t end)
11450
/*[clinic end generated code: output=8fcc3aef0b18edbf input=c9209e05438cc352]*/
11451
27.2M
{
11452
27.2M
    assert(PyUnicode_Check(str));
11453
27.2M
    assert(PyUnicode_Check(substr));
11454
11455
27.2M
    Py_ssize_t result;
11456
27.2M
    int kind1, kind2;
11457
27.2M
    const void *buf1 = NULL, *buf2 = NULL;
11458
27.2M
    Py_ssize_t len1, len2;
11459
11460
27.2M
    kind1 = PyUnicode_KIND(str);
11461
27.2M
    kind2 = PyUnicode_KIND(substr);
11462
27.2M
    if (kind1 < kind2)
11463
0
        return 0;
11464
11465
27.2M
    len1 = PyUnicode_GET_LENGTH(str);
11466
27.2M
    len2 = PyUnicode_GET_LENGTH(substr);
11467
27.2M
    ADJUST_INDICES(start, end, len1);
11468
27.2M
    if (end - start < len2)
11469
4.79M
        return 0;
11470
11471
22.5M
    buf1 = PyUnicode_DATA(str);
11472
22.5M
    buf2 = PyUnicode_DATA(substr);
11473
22.5M
    if (kind2 != kind1) {
11474
6.07M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11475
6.07M
        if (!buf2)
11476
0
            goto onError;
11477
6.07M
    }
11478
11479
    // We don't reuse `anylib_count` here because of the explicit casts.
11480
22.5M
    switch (kind1) {
11481
16.4M
    case PyUnicode_1BYTE_KIND:
11482
16.4M
        result = ucs1lib_count(
11483
16.4M
            ((const Py_UCS1*)buf1) + start, end - start,
11484
16.4M
            buf2, len2, PY_SSIZE_T_MAX
11485
16.4M
            );
11486
16.4M
        break;
11487
3.71M
    case PyUnicode_2BYTE_KIND:
11488
3.71M
        result = ucs2lib_count(
11489
3.71M
            ((const Py_UCS2*)buf1) + start, end - start,
11490
3.71M
            buf2, len2, PY_SSIZE_T_MAX
11491
3.71M
            );
11492
3.71M
        break;
11493
2.35M
    case PyUnicode_4BYTE_KIND:
11494
2.35M
        result = ucs4lib_count(
11495
2.35M
            ((const Py_UCS4*)buf1) + start, end - start,
11496
2.35M
            buf2, len2, PY_SSIZE_T_MAX
11497
2.35M
            );
11498
2.35M
        break;
11499
0
    default:
11500
0
        Py_UNREACHABLE();
11501
22.5M
    }
11502
11503
22.5M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11504
22.5M
    if (kind2 != kind1)
11505
6.07M
        PyMem_Free((void *)buf2);
11506
11507
22.5M
    return result;
11508
0
  onError:
11509
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11510
0
    if (kind2 != kind1)
11511
0
        PyMem_Free((void *)buf2);
11512
0
    return -1;
11513
22.5M
}
11514
11515
/*[clinic input]
11516
str.encode as unicode_encode
11517
11518
    encoding: str(c_default="NULL") = 'utf-8'
11519
        The encoding in which to encode the string.
11520
    errors: str(c_default="NULL") = 'strict'
11521
        The error handling scheme to use for encoding errors.
11522
        The default is 'strict' meaning that encoding errors raise a
11523
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace'
11524
        and 'xmlcharrefreplace' as well as any other name registered with
11525
        codecs.register_error that can handle UnicodeEncodeErrors.
11526
11527
Encode the string using the codec registered for encoding.
11528
[clinic start generated code]*/
11529
11530
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=b85a9645cb33b729]*/
{
    /* Thin wrapper: all work is delegated to PyUnicode_AsEncodedString(),
       whose result is returned unchanged. */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11536
11537
/*[clinic input]
11538
str.expandtabs as unicode_expandtabs
11539
11540
    tabsize: int = 8
11541
11542
Return a copy where all tab characters are expanded using spaces.
11543
11544
If tabsize is not given, a tab size of 8 characters is assumed.
11545
[clinic start generated code]*/
11546
11547
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* Two passes over the source: the first computes the output length
       (and detects overflow), the second fills the new string.  Tabs are
       dropped entirely when tabsize <= 0.  Returns NULL with OverflowError
       if the result would exceed PY_SSIZE_T_MAX, or NULL if allocation
       fails. */
    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src = PyUnicode_DATA(self);
    PyObject *expanded;
    void *dest;
    Py_ssize_t out_len = 0;   /* characters produced so far */
    Py_ssize_t column = 0;    /* position within the current line */
    int saw_tab = 0;

    /* First pass: determine size of output string */
    for (Py_ssize_t pos = 0; pos < src_len; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src, pos);
        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            column++;
            out_len++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        saw_tab = 1;
        if (tabsize > 0) {
            /* cannot overflow */
            const Py_ssize_t pad = tabsize - (column % tabsize);
            if (out_len > PY_SSIZE_T_MAX - pad) {
                goto overflow;
            }
            column += pad;
            out_len += pad;
        }
    }
    if (!saw_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (expanded == NULL) {
        return NULL;
    }
    dest = PyUnicode_DATA(expanded);

    out_len = 0;
    column = 0;
    for (Py_ssize_t pos = 0; pos < src_len; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src, pos);
        if (ch != '\t') {
            column++;
            PyUnicode_WRITE(kind, dest, out_len, ch);
            out_len++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        if (tabsize > 0) {
            /* Pad with spaces up to the next tab stop. */
            const Py_ssize_t pad = tabsize - (column % tabsize);
            column += pad;
            _PyUnicode_Fill(kind, dest, ' ', out_len, pad);
            out_len += pad;
        }
    }
    assert (out_len == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11622
11623
/*[clinic input]
11624
@permit_long_summary
11625
str.find as unicode_find = str.count
11626
11627
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11628
11629
Optional arguments start and end are interpreted as in slice
11630
notation.  Return -1 on failure.
11631
[clinic start generated code]*/
11632
11633
static Py_ssize_t
11634
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11635
                  Py_ssize_t end)
11636
/*[clinic end generated code: output=51dbe6255712e278 input=f57e93c59d1ee927]*/
11637
26.9M
{
11638
26.9M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11639
26.9M
    if (result < 0) {
11640
5.93M
        return -1;
11641
5.93M
    }
11642
21.0M
    return result;
11643
26.9M
}
11644
11645
static PyObject *
11646
unicode_getitem(PyObject *self, Py_ssize_t index)
11647
61.7M
{
11648
61.7M
    const void *data;
11649
61.7M
    int kind;
11650
61.7M
    Py_UCS4 ch;
11651
11652
61.7M
    if (!PyUnicode_Check(self)) {
11653
0
        PyErr_BadArgument();
11654
0
        return NULL;
11655
0
    }
11656
61.7M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11657
14.9k
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11658
14.9k
        return NULL;
11659
14.9k
    }
11660
61.7M
    kind = PyUnicode_KIND(self);
11661
61.7M
    data = PyUnicode_DATA(self);
11662
61.7M
    ch = PyUnicode_READ(kind, data, index);
11663
61.7M
    return unicode_char(ch);
11664
61.7M
}
11665
11666
/* Believe it or not, this produces the same value for ASCII strings
11667
   as bytes_hash(). */
11668
static Py_hash_t
11669
unicode_hash(PyObject *self)
11670
218M
{
11671
218M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11672
11673
#ifdef Py_DEBUG
11674
    assert(_Py_HashSecret_Initialized);
11675
#endif
11676
218M
    Py_hash_t hash = PyUnicode_HASH(self);
11677
218M
    if (hash != -1) {
11678
168M
        return hash;
11679
168M
    }
11680
50.2M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11681
50.2M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11682
11683
50.2M
    PyUnicode_SET_HASH(self, x);
11684
50.2M
    return x;
11685
218M
}
11686
11687
/*[clinic input]
11688
@permit_long_summary
11689
str.index as unicode_index = str.count
11690
11691
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11692
11693
Optional arguments start and end are interpreted as in slice
11694
notation.  Raises ValueError when the substring is not found.
11695
[clinic start generated code]*/
11696
11697
static Py_ssize_t
11698
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11699
                   Py_ssize_t end)
11700
/*[clinic end generated code: output=77558288837cdf40 input=5900ab84de55e628]*/
11701
45.0k
{
11702
45.0k
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11703
45.0k
    if (result == -1) {
11704
712
        PyErr_SetString(PyExc_ValueError, "substring not found");
11705
712
    }
11706
44.3k
    else if (result < 0) {
11707
0
        return -1;
11708
0
    }
11709
45.0k
    return result;
11710
45.0k
}
11711
11712
/*[clinic input]
11713
@permit_long_summary
11714
str.isascii as unicode_isascii
11715
11716
Return True if all characters in the string are ASCII, False otherwise.
11717
11718
ASCII characters have code points in the range U+0000-U+007F.
11719
Empty string is ASCII too.
11720
[clinic start generated code]*/
11721
11722
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=dc74e1ced821159f]*/
{
    /* No scan: the answer is read from the object via PyUnicode_IS_ASCII(). */
    const long ascii_only = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(ascii_only);
}
11728
11729
/*[clinic input]
11730
str.islower as unicode_islower
11731
11732
Return True if the string is a lowercase string, False otherwise.
11733
11734
A string is lowercase if all cased characters in the string are
11735
lowercase and there is at least one cased character in the string.
11736
[clinic start generated code]*/
11737
11738
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=1879b48dfc628366]*/
{
    /* True iff no character is uppercase or titlecase and at least one
       character is lowercase. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
    }

    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Stop testing for lowercase once one has been found. */
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11771
11772
/*[clinic input]
11773
str.isupper as unicode_isupper
11774
11775
Return True if the string is an uppercase string, False otherwise.
11776
11777
A string is uppercase if all cased characters in the string are
11778
uppercase and there is at least one cased character in the string.
11779
[clinic start generated code]*/
11780
11781
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=77d29904aef0e3a0]*/
{
    /* True iff no character is lowercase or titlecase and at least one
       character is uppercase. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Stop testing for uppercase once one has been found. */
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
11814
11815
/*[clinic input]
11816
str.istitle as unicode_istitle
11817
11818
Return True if the string is a title-cased string, False otherwise.
11819
11820
In a title-cased string, upper- and title-case characters may only
11821
follow uncased characters and lowercase characters only cased ones.
11822
[clinic start generated code]*/
11823
11824
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* Walk the string tracking whether the previous character was cased:
       an upper/title character must not follow a cased one, a lowercase
       character must follow a cased one.  The result is True only if at
       least one cased character was seen. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    int seen_cased = 0;
    int after_cased = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* Word-initial form in the middle of a cased run. */
            if (after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else if (Py_UNICODE_ISLOWER(ch)) {
            /* Lowercase letter with no cased character before it. */
            if (!after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else {
            /* Uncased character: the next cased one starts a new word. */
            after_cased = 0;
            continue;
        }
        after_cased = 1;
        seen_cased = 1;
    }
    return PyBool_FromLong(seen_cased);
}
11870
11871
/*[clinic input]
11872
str.isspace as unicode_isspace
11873
11874
Return True if the string is a whitespace string, False otherwise.
11875
11876
A string is whitespace if all characters in the string are
11877
whitespace and there is at least one character in the string.
11878
[clinic start generated code]*/
11879
11880
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=29e09560fc23fbeb]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISSPACE(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
    }

    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11908
11909
/*[clinic input]
11910
str.isalpha as unicode_isalpha
11911
11912
Return True if the string is an alphabetic string, False otherwise.
11913
11914
A string is alphabetic if all characters in the string are
11915
alphabetic and there is at least one character in the string.
11916
[clinic start generated code]*/
11917
11918
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=9906a07f3e04892e]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISALPHA(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
    }

    for (Py_ssize_t pos = 0; pos < length; pos++) {
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11945
11946
/*[clinic input]
11947
@permit_long_summary
11948
str.isalnum as unicode_isalnum
11949
11950
Return True if the string is an alpha-numeric string, False otherwise.
11951
11952
A string is alpha-numeric if all characters in the string are
11953
alpha-numeric and there is at least one character in the string.
11954
[clinic start generated code]*/
11955
11956
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=892f64ebc171fd4f]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISALNUM(). */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11985
11986
/*[clinic input]
11987
str.isdecimal as unicode_isdecimal
11988
11989
Return True if the string is a decimal string, False otherwise.
11990
11991
A string is a decimal string if all characters in the string are
11992
decimal and there is at least one character in the string.
11993
[clinic start generated code]*/
11994
11995
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=63b0453c48cad0af]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISDECIMAL(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
    }

    for (Py_ssize_t pos = 0; pos < length; pos++) {
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12022
12023
/*[clinic input]
12024
str.isdigit as unicode_isdigit
12025
12026
Return True if the string is a digit string, False otherwise.
12027
12028
A string is a digit string if all characters in the string are
12029
digits and there is at least one character in the string.
12030
[clinic start generated code]*/
12031
12032
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=353b03747b062e4b]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISDIGIT(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    for (Py_ssize_t pos = 0; pos < length; pos++) {
        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12060
12061
/*[clinic input]
12062
str.isnumeric as unicode_isnumeric
12063
12064
Return True if the string is a numeric string, False otherwise.
12065
12066
A string is numeric if all characters in the string are numeric and
12067
there is at least one character in the string.
12068
[clinic start generated code]*/
12069
12070
static PyObject *
12071
unicode_isnumeric_impl(PyObject *self)
12072
/*[clinic end generated code: output=9172a32d9013051a input=83b2a072ed7aff48]*/
12073
0
{
12074
0
    Py_ssize_t i, length;
12075
0
    int kind;
12076
0
    const void *data;
12077
12078
0
    length = PyUnicode_GET_LENGTH(self);
12079
0
    kind = PyUnicode_KIND(self);
12080
0
    data = PyUnicode_DATA(self);
12081
12082
    /* Shortcut for single character strings */
12083
0
    if (length == 1)
12084
0
        return PyBool_FromLong(
12085
0
            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12086
12087
    /* Special case for empty strings */
12088
0
    if (length == 0)
12089
0
        Py_RETURN_FALSE;
12090
12091
0
    for (i = 0; i < length; i++) {
12092
0
        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12093
0
            Py_RETURN_FALSE;
12094
0
    }
12095
0
    Py_RETURN_TRUE;
12096
0
}
12097
12098
Py_ssize_t
12099
_PyUnicode_ScanIdentifier(PyObject *self)
12100
66.5k
{
12101
66.5k
    Py_ssize_t i;
12102
66.5k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12103
66.5k
    if (len == 0) {
12104
        /* an empty string is not a valid identifier */
12105
0
        return 0;
12106
0
    }
12107
12108
66.5k
    int kind = PyUnicode_KIND(self);
12109
66.5k
    const void *data = PyUnicode_DATA(self);
12110
66.5k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12111
    /* PEP 3131 says that the first character must be in
12112
       XID_Start and subsequent characters in XID_Continue,
12113
       and for the ASCII range, the 2.x rules apply (i.e
12114
       start with letters and underscore, continue with
12115
       letters, digits, underscore). However, given the current
12116
       definition of XID_Start and XID_Continue, it is sufficient
12117
       to check just for these, except that _ must be allowed
12118
       as starting an identifier.  */
12119
66.5k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12120
820
        return 0;
12121
820
    }
12122
12123
575k
    for (i = 1; i < len; i++) {
12124
509k
        ch = PyUnicode_READ(kind, data, i);
12125
509k
        if (!_PyUnicode_IsXidContinue(ch)) {
12126
350
            return i;
12127
350
        }
12128
509k
    }
12129
65.3k
    return i;
12130
65.6k
}
12131
12132
int
12133
PyUnicode_IsIdentifier(PyObject *self)
12134
56.0k
{
12135
56.0k
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12136
56.0k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12137
    /* an empty string is not a valid identifier */
12138
56.0k
    return len && i == len;
12139
56.0k
}
12140
12141
/*[clinic input]
12142
@permit_long_summary
12143
str.isidentifier as unicode_isidentifier
12144
12145
Return True if the string is a valid Python identifier, False otherwise.
12146
12147
Call keyword.iskeyword(s) to test whether string s is a reserved
12148
identifier, such as "def" or "class".
12149
[clinic start generated code]*/
12150
12151
static PyObject *
12152
unicode_isidentifier_impl(PyObject *self)
12153
/*[clinic end generated code: output=fe585a9666572905 input=cabde62c20a3be6b]*/
12154
53.6k
{
12155
53.6k
    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12156
53.6k
}
12157
12158
/*[clinic input]
12159
@permit_long_summary
12160
str.isprintable as unicode_isprintable
12161
12162
Return True if all characters in the string are printable, False otherwise.
12163
12164
A character is printable if repr() may use it in its output.
12165
[clinic start generated code]*/
12166
12167
static PyObject *
12168
unicode_isprintable_impl(PyObject *self)
12169
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
12170
1.55M
{
12171
1.55M
    Py_ssize_t i, length;
12172
1.55M
    int kind;
12173
1.55M
    const void *data;
12174
12175
1.55M
    length = PyUnicode_GET_LENGTH(self);
12176
1.55M
    kind = PyUnicode_KIND(self);
12177
1.55M
    data = PyUnicode_DATA(self);
12178
12179
    /* Shortcut for single character strings */
12180
1.55M
    if (length == 1)
12181
1.55M
        return PyBool_FromLong(
12182
1.55M
            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12183
12184
0
    for (i = 0; i < length; i++) {
12185
0
        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12186
0
            Py_RETURN_FALSE;
12187
0
        }
12188
0
    }
12189
0
    Py_RETURN_TRUE;
12190
0
}
12191
12192
/*[clinic input]
12193
str.join as unicode_join
12194
12195
    iterable: object
12196
    /
12197
12198
Concatenate any number of strings.
12199
12200
The string whose method is called is inserted in between each given
12201
string.  The result is returned as a new string.
12202
12203
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12204
[clinic start generated code]*/
12205
12206
static PyObject *
12207
unicode_join(PyObject *self, PyObject *iterable)
12208
/*[clinic end generated code: output=6857e7cecfe7bf98 input=fd330a11ee845fb2]*/
12209
19.2M
{
12210
19.2M
    return PyUnicode_Join(self, iterable);
12211
19.2M
}
12212
12213
static Py_ssize_t
12214
unicode_length(PyObject *self)
12215
32.4M
{
12216
32.4M
    return PyUnicode_GET_LENGTH(self);
12217
32.4M
}
12218
12219
/*[clinic input]
12220
str.ljust as unicode_ljust
12221
12222
    width: Py_ssize_t
12223
    fillchar: Py_UCS4 = ' '
12224
    /
12225
12226
Return a left-justified string of length width.
12227
12228
Padding is done using the specified fill character (default is
12229
a space).
12230
[clinic start generated code]*/
12231
12232
static PyObject *
12233
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12234
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=8a55f06694c20ed6]*/
12235
130
{
12236
130
    if (PyUnicode_GET_LENGTH(self) >= width)
12237
62
        return unicode_result_unchanged(self);
12238
12239
68
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12240
130
}
12241
12242
/*[clinic input]
12243
str.lower as unicode_lower
12244
12245
Return a copy of the string converted to lowercase.
12246
[clinic start generated code]*/
12247
12248
static PyObject *
12249
unicode_lower_impl(PyObject *self)
12250
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12251
72.6M
{
12252
72.6M
    if (PyUnicode_IS_ASCII(self))
12253
67.8M
        return ascii_upper_or_lower(self, 1);
12254
4.83M
    return case_operation(self, do_lower);
12255
72.6M
}
12256
12257
50.0M
#define LEFTSTRIP 0
12258
55.1M
#define RIGHTSTRIP 1
12259
38.7M
#define BOTHSTRIP 2
12260
12261
/* Arrays indexed by above */
12262
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};
12263
12264
0
#define STRIPNAME(i) (stripfuncnames[i])
12265
12266
/* externally visible for str.strip(unicode) */
12267
PyObject *
12268
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12269
4.43M
{
12270
4.43M
    const void *data;
12271
4.43M
    int kind;
12272
4.43M
    Py_ssize_t i, j, len;
12273
4.43M
    BLOOM_MASK sepmask;
12274
4.43M
    Py_ssize_t seplen;
12275
12276
4.43M
    kind = PyUnicode_KIND(self);
12277
4.43M
    data = PyUnicode_DATA(self);
12278
4.43M
    len = PyUnicode_GET_LENGTH(self);
12279
4.43M
    seplen = PyUnicode_GET_LENGTH(sepobj);
12280
4.43M
    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12281
4.43M
                              PyUnicode_DATA(sepobj),
12282
4.43M
                              seplen);
12283
12284
4.43M
    i = 0;
12285
4.43M
    if (striptype != RIGHTSTRIP) {
12286
460k
        while (i < len) {
12287
457k
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12288
457k
            if (!BLOOM(sepmask, ch))
12289
424k
                break;
12290
33.2k
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12291
2.67k
                break;
12292
30.5k
            i++;
12293
30.5k
        }
12294
430k
    }
12295
12296
4.43M
    j = len;
12297
4.43M
    if (striptype != LEFTSTRIP) {
12298
4.00M
        j--;
12299
4.65M
        while (j >= i) {
12300
3.50M
            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12301
3.50M
            if (!BLOOM(sepmask, ch))
12302
2.71M
                break;
12303
785k
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12304
130k
                break;
12305
655k
            j--;
12306
655k
        }
12307
12308
4.00M
        j++;
12309
4.00M
    }
12310
12311
4.43M
    return PyUnicode_Substring(self, i, j);
12312
4.43M
}
12313
12314
PyObject*
12315
_PyUnicode_BinarySlice(PyObject *container, PyObject *start_o, PyObject *stop_o)
12316
31.8M
{
12317
31.8M
    assert(PyUnicode_CheckExact(container));
12318
31.8M
    Py_ssize_t len = PyUnicode_GET_LENGTH(container);
12319
31.8M
    Py_ssize_t istart, istop;
12320
31.8M
    if (!_PyEval_UnpackIndices(start_o, stop_o, len, &istart, &istop)) {
12321
0
        return NULL;
12322
0
    }
12323
31.8M
    return PyUnicode_Substring(container, istart, istop);
12324
31.8M
}
12325
12326
PyObject*
12327
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12328
234M
{
12329
234M
    const unsigned char *data;
12330
234M
    int kind;
12331
234M
    Py_ssize_t length;
12332
12333
234M
    length = PyUnicode_GET_LENGTH(self);
12334
234M
    end = Py_MIN(end, length);
12335
12336
234M
    if (start == 0 && end == length)
12337
48.2M
        return unicode_result_unchanged(self);
12338
12339
186M
    if (start < 0 || end < 0) {
12340
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12341
0
        return NULL;
12342
0
    }
12343
186M
    if (start >= length || end < start)
12344
1.40M
        _Py_RETURN_UNICODE_EMPTY();
12345
12346
185M
    length = end - start;
12347
185M
    if (PyUnicode_IS_ASCII(self)) {
12348
57.9M
        data = PyUnicode_1BYTE_DATA(self);
12349
57.9M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12350
57.9M
    }
12351
127M
    else {
12352
127M
        kind = PyUnicode_KIND(self);
12353
127M
        data = PyUnicode_1BYTE_DATA(self);
12354
127M
        return PyUnicode_FromKindAndData(kind,
12355
127M
                                         data + kind * start,
12356
127M
                                         length);
12357
127M
    }
12358
185M
}
12359
12360
static PyObject *
12361
do_strip(PyObject *self, int striptype)
12362
43.5M
{
12363
43.5M
    Py_ssize_t len, i, j;
12364
12365
43.5M
    len = PyUnicode_GET_LENGTH(self);
12366
12367
43.5M
    if (PyUnicode_IS_ASCII(self)) {
12368
34.7M
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12369
12370
34.7M
        i = 0;
12371
34.7M
        if (striptype != RIGHTSTRIP) {
12372
33.1M
            while (i < len) {
12373
28.4M
                Py_UCS1 ch = data[i];
12374
28.4M
                if (!_Py_ascii_whitespace[ch])
12375
27.2M
                    break;
12376
1.24M
                i++;
12377
1.24M
            }
12378
31.9M
        }
12379
12380
34.7M
        j = len;
12381
34.7M
        if (striptype != LEFTSTRIP) {
12382
34.4M
            j--;
12383
36.4M
            while (j >= i) {
12384
30.5M
                Py_UCS1 ch = data[j];
12385
30.5M
                if (!_Py_ascii_whitespace[ch])
12386
28.5M
                    break;
12387
2.01M
                j--;
12388
2.01M
            }
12389
34.4M
            j++;
12390
34.4M
        }
12391
34.7M
    }
12392
8.76M
    else {
12393
8.76M
        int kind = PyUnicode_KIND(self);
12394
8.76M
        const void *data = PyUnicode_DATA(self);
12395
12396
8.76M
        i = 0;
12397
8.76M
        if (striptype != RIGHTSTRIP) {
12398
9.64M
            while (i < len) {
12399
9.64M
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12400
9.64M
                if (!Py_UNICODE_ISSPACE(ch))
12401
8.43M
                    break;
12402
1.20M
                i++;
12403
1.20M
            }
12404
8.43M
        }
12405
12406
8.76M
        j = len;
12407
8.76M
        if (striptype != LEFTSTRIP) {
12408
7.49M
            j--;
12409
8.33M
            while (j >= i) {
12410
8.31M
                Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12411
8.31M
                if (!Py_UNICODE_ISSPACE(ch))
12412
7.47M
                    break;
12413
844k
                j--;
12414
844k
            }
12415
7.49M
            j++;
12416
7.49M
        }
12417
8.76M
    }
12418
12419
43.5M
    return PyUnicode_Substring(self, i, j);
12420
43.5M
}
12421
12422
12423
static PyObject *
12424
do_argstrip(PyObject *self, int striptype, PyObject *sep)
12425
47.9M
{
12426
47.9M
    if (sep != Py_None) {
12427
4.43M
        if (PyUnicode_Check(sep))
12428
4.43M
            return _PyUnicode_XStrip(self, striptype, sep);
12429
0
        else {
12430
0
            PyErr_Format(PyExc_TypeError,
12431
0
                         "%s arg must be None or str",
12432
0
                         STRIPNAME(striptype));
12433
0
            return NULL;
12434
0
        }
12435
4.43M
    }
12436
12437
43.5M
    return do_strip(self, striptype);
12438
47.9M
}
12439
12440
12441
/*[clinic input]
12442
@permit_long_summary
12443
str.strip as unicode_strip
12444
12445
    chars: object = None
12446
    /
12447
12448
Return a copy of the string with leading and trailing whitespace removed.
12449
12450
If chars is given and not None, remove characters in chars instead.
12451
[clinic start generated code]*/
12452
12453
static PyObject *
12454
unicode_strip_impl(PyObject *self, PyObject *chars)
12455
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
12456
38.7M
{
12457
38.7M
    return do_argstrip(self, BOTHSTRIP, chars);
12458
38.7M
}
12459
12460
12461
/*[clinic input]
12462
str.lstrip as unicode_lstrip
12463
12464
    chars: object = None
12465
    /
12466
12467
Return a copy of the string with leading whitespace removed.
12468
12469
If chars is given and not None, remove characters in chars instead.
12470
[clinic start generated code]*/
12471
12472
static PyObject *
12473
unicode_lstrip_impl(PyObject *self, PyObject *chars)
12474
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12475
2.05M
{
12476
2.05M
    return do_argstrip(self, LEFTSTRIP, chars);
12477
2.05M
}
12478
12479
12480
/*[clinic input]
12481
str.rstrip as unicode_rstrip
12482
12483
    chars: object = None
12484
    /
12485
12486
Return a copy of the string with trailing whitespace removed.
12487
12488
If chars is given and not None, remove characters in chars instead.
12489
[clinic start generated code]*/
12490
12491
static PyObject *
12492
unicode_rstrip_impl(PyObject *self, PyObject *chars)
12493
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12494
7.16M
{
12495
7.16M
    return do_argstrip(self, RIGHTSTRIP, chars);
12496
7.16M
}
12497
12498
12499
PyObject *
12500
_PyUnicode_Repeat(PyObject *str, Py_ssize_t len)
12501
326k
{
12502
326k
    PyObject *u;
12503
326k
    Py_ssize_t nchars, n;
12504
12505
326k
    if (len < 1)
12506
33.9k
        _Py_RETURN_UNICODE_EMPTY();
12507
12508
    /* no repeat, return original string */
12509
292k
    if (len == 1)
12510
27.0k
        return unicode_result_unchanged(str);
12511
12512
265k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12513
0
        PyErr_SetString(PyExc_OverflowError,
12514
0
                        "repeated string is too long");
12515
0
        return NULL;
12516
0
    }
12517
265k
    nchars = len * PyUnicode_GET_LENGTH(str);
12518
12519
265k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12520
265k
    if (!u)
12521
0
        return NULL;
12522
265k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12523
12524
265k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12525
263k
        int kind = PyUnicode_KIND(str);
12526
263k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12527
263k
        if (kind == PyUnicode_1BYTE_KIND) {
12528
263k
            void *to = PyUnicode_DATA(u);
12529
263k
            memset(to, (unsigned char)fill_char, len);
12530
263k
        }
12531
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12532
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12533
0
            for (n = 0; n < len; ++n)
12534
0
                ucs2[n] = fill_char;
12535
0
        } else {
12536
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12537
0
            assert(kind == PyUnicode_4BYTE_KIND);
12538
0
            for (n = 0; n < len; ++n)
12539
0
                ucs4[n] = fill_char;
12540
0
        }
12541
263k
    }
12542
2.06k
    else {
12543
2.06k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12544
2.06k
        char *to = (char *) PyUnicode_DATA(u);
12545
2.06k
        _PyBytes_RepeatBuffer(to, nchars * char_size, PyUnicode_DATA(str),
12546
2.06k
            PyUnicode_GET_LENGTH(str) * char_size);
12547
2.06k
    }
12548
12549
265k
    assert(_PyUnicode_CheckConsistency(u, 1));
12550
265k
    return u;
12551
265k
}
12552
12553
PyObject *
12554
PyUnicode_Replace(PyObject *str,
12555
                  PyObject *substr,
12556
                  PyObject *replstr,
12557
                  Py_ssize_t maxcount)
12558
0
{
12559
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12560
0
            ensure_unicode(replstr) < 0)
12561
0
        return NULL;
12562
0
    return replace(str, substr, replstr, maxcount);
12563
0
}
12564
12565
/*[clinic input]
12566
str.replace as unicode_replace
12567
12568
    old: unicode
12569
    new: unicode
12570
    /
12571
    count: Py_ssize_t = -1
12572
        Maximum number of occurrences to replace.
12573
        -1 (the default value) means replace all occurrences.
12574
12575
Return a copy with all occurrences of substring old replaced by new.
12576
12577
If count is given, only the first count occurrences are replaced.
12578
If count is not specified or -1, then all occurrences are replaced.
12579
[clinic start generated code]*/
12580
12581
static PyObject *
12582
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12583
                     Py_ssize_t count)
12584
/*[clinic end generated code: output=b63f1a8b5eebf448 input=d15a6886b05e2edc]*/
12585
20.8M
{
12586
20.8M
    return replace(self, old, new, count);
12587
20.8M
}
12588
12589
/*[clinic input]
12590
str.removeprefix as unicode_removeprefix
12591
12592
    prefix: unicode
12593
    /
12594
12595
Return a str with the given prefix string removed if present.
12596
12597
If the string starts with the prefix string, return
12598
string[len(prefix):].  Otherwise, return a copy of the original
12599
string.
12600
[clinic start generated code]*/
12601
12602
static PyObject *
12603
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12604
/*[clinic end generated code: output=f1e5945e9763bcb9 input=90d162724944bfa7]*/
12605
28
{
12606
28
    int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12607
28
    if (match == -1) {
12608
0
        return NULL;
12609
0
    }
12610
28
    if (match) {
12611
28
        return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12612
28
                                   PyUnicode_GET_LENGTH(self));
12613
28
    }
12614
0
    return unicode_result_unchanged(self);
12615
28
}
12616
12617
/*[clinic input]
12618
str.removesuffix as unicode_removesuffix
12619
12620
    suffix: unicode
12621
    /
12622
12623
Return a str with the given suffix string removed if present.
12624
12625
If the string ends with the suffix string and that suffix is not
12626
empty, return string[:-len(suffix)].  Otherwise, return a copy of
12627
the original string.
12628
[clinic start generated code]*/
12629
12630
static PyObject *
12631
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12632
/*[clinic end generated code: output=d36629e227636822 input=6efc96152d4bfcd5]*/
12633
0
{
12634
0
    int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12635
0
    if (match == -1) {
12636
0
        return NULL;
12637
0
    }
12638
0
    if (match) {
12639
0
        return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12640
0
                                            - PyUnicode_GET_LENGTH(suffix));
12641
0
    }
12642
0
    return unicode_result_unchanged(self);
12643
0
}
12644
12645
static PyObject *
12646
unicode_repr(PyObject *unicode)
12647
7.10M
{
12648
7.10M
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
12649
7.10M
    const void *idata = PyUnicode_DATA(unicode);
12650
12651
    /* Compute length of output, quote characters, and
12652
       maximum character */
12653
7.10M
    Py_ssize_t osize = 0;
12654
7.10M
    Py_UCS4 maxch = 127;
12655
7.10M
    Py_ssize_t squote = 0;
12656
7.10M
    Py_ssize_t dquote = 0;
12657
7.10M
    int ikind = PyUnicode_KIND(unicode);
12658
187M
    for (Py_ssize_t i = 0; i < isize; i++) {
12659
180M
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12660
180M
        Py_ssize_t incr = 1;
12661
180M
        switch (ch) {
12662
351k
        case '\'': squote++; break;
12663
463k
        case '"':  dquote++; break;
12664
481k
        case '\\': case '\t': case '\r': case '\n':
12665
481k
            incr = 2;
12666
481k
            break;
12667
178M
        default:
12668
            /* Fast-path ASCII */
12669
178M
            if (ch < ' ' || ch == 0x7f)
12670
109M
                incr = 4; /* \xHH */
12671
69.2M
            else if (ch < 0x7f)
12672
58.7M
                ;
12673
10.4M
            else if (Py_UNICODE_ISPRINTABLE(ch))
12674
10.2M
                maxch = (ch > maxch) ? ch : maxch;
12675
225k
            else if (ch < 0x100)
12676
70.0k
                incr = 4; /* \xHH */
12677
155k
            else if (ch < 0x10000)
12678
64.7k
                incr = 6; /* \uHHHH */
12679
90.7k
            else
12680
90.7k
                incr = 10; /* \uHHHHHHHH */
12681
180M
        }
12682
180M
        if (osize > PY_SSIZE_T_MAX - incr) {
12683
0
            PyErr_SetString(PyExc_OverflowError,
12684
0
                            "string is too long to generate repr");
12685
0
            return NULL;
12686
0
        }
12687
180M
        osize += incr;
12688
180M
    }
12689
12690
7.10M
    Py_UCS4 quote = '\'';
12691
7.10M
    int changed = (osize != isize);
12692
7.10M
    if (squote) {
12693
83.0k
        changed = 1;
12694
83.0k
        if (dquote)
12695
            /* Both squote and dquote present. Use squote,
12696
               and escape them */
12697
6.96k
            osize += squote;
12698
76.0k
        else
12699
76.0k
            quote = '"';
12700
83.0k
    }
12701
7.10M
    osize += 2;   /* quotes */
12702
12703
7.10M
    PyObject *repr = PyUnicode_New(osize, maxch);
12704
7.10M
    if (repr == NULL)
12705
0
        return NULL;
12706
7.10M
    int okind = PyUnicode_KIND(repr);
12707
7.10M
    void *odata = PyUnicode_DATA(repr);
12708
12709
7.10M
    if (!changed) {
12710
4.49M
        PyUnicode_WRITE(okind, odata, 0, quote);
12711
12712
4.49M
        _PyUnicode_FastCopyCharacters(repr, 1,
12713
4.49M
                                      unicode, 0,
12714
4.49M
                                      isize);
12715
12716
4.49M
        PyUnicode_WRITE(okind, odata, osize-1, quote);
12717
4.49M
    }
12718
2.60M
    else {
12719
2.60M
        switch (okind) {
12720
2.40M
        case PyUnicode_1BYTE_KIND:
12721
2.40M
            ucs1lib_repr(unicode, quote, odata);
12722
2.40M
            break;
12723
195k
        case PyUnicode_2BYTE_KIND:
12724
195k
            ucs2lib_repr(unicode, quote, odata);
12725
195k
            break;
12726
5.45k
        default:
12727
5.45k
            assert(okind == PyUnicode_4BYTE_KIND);
12728
5.45k
            ucs4lib_repr(unicode, quote, odata);
12729
2.60M
        }
12730
2.60M
    }
12731
12732
7.10M
    assert(_PyUnicode_CheckConsistency(repr, 1));
12733
7.10M
    return repr;
12734
7.10M
}
12735
12736
/*[clinic input]
12737
@permit_long_summary
12738
str.rfind as unicode_rfind = str.count
12739
12740
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12741
12742
Optional arguments start and end are interpreted as in slice
12743
notation.  Return -1 on failure.
12744
[clinic start generated code]*/
12745
12746
static Py_ssize_t
12747
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12748
                   Py_ssize_t end)
12749
/*[clinic end generated code: output=880b29f01dd014c8 input=2e67789533baf2f5]*/
12750
248k
{
12751
248k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12752
248k
    if (result < 0) {
12753
10.8k
        return -1;
12754
10.8k
    }
12755
237k
    return result;
12756
248k
}
12757
12758
/*[clinic input]
12759
@permit_long_summary
12760
str.rindex as unicode_rindex = str.count
12761
12762
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12763
12764
Optional arguments start and end are interpreted as in slice
12765
notation.  Raises ValueError when the substring is not found.
12766
[clinic start generated code]*/
12767
12768
static Py_ssize_t
12769
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12770
                    Py_ssize_t end)
12771
/*[clinic end generated code: output=5f3aef124c867fe1 input=e29d446c8234c9d9]*/
12772
165k
{
12773
165k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12774
165k
    if (result == -1) {
12775
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12776
0
    }
12777
165k
    else if (result < 0) {
12778
0
        return -1;
12779
0
    }
12780
165k
    return result;
12781
165k
}
12782
12783
/*[clinic input]
12784
str.rjust as unicode_rjust
12785
12786
    width: Py_ssize_t
12787
    fillchar: Py_UCS4 = ' '
12788
    /
12789
12790
Return a right-justified string of length width.
12791
12792
Padding is done using the specified fill character (default is
12793
a space).
12794
[clinic start generated code]*/
12795
12796
static PyObject *
12797
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12798
/*[clinic end generated code: output=804a1a57fbe8d5cf input=1256a8d659589907]*/
12799
0
{
12800
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12801
0
        return unicode_result_unchanged(self);
12802
12803
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12804
0
}
12805
12806
PyObject *
12807
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12808
0
{
12809
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12810
0
        return NULL;
12811
12812
0
    return split(s, sep, maxsplit);
12813
0
}
12814
12815
/*[clinic input]
12816
@permit_long_summary
12817
str.split as unicode_split
12818
12819
    sep: object = None
12820
        The separator used to split the string.
12821
12822
        When set to None (the default value), will split on any
12823
        whitespace character (including \n \r \t \f and spaces) and
12824
        will discard empty strings from the result.
12825
    maxsplit: Py_ssize_t = -1
12826
        Maximum number of splits.
12827
        -1 (the default value) means no limit.
12828
12829
Return a list of the substrings in the string, using sep as the separator string.
12830
12831
Splitting starts at the front of the string and works to the end.
12832
12833
Note, str.split() is mainly useful for data that has been
12834
intentionally delimited.  With natural text that includes
12835
punctuation, consider using the regular expression module.
12836
12837
[clinic start generated code]*/
12838
12839
static PyObject *
12840
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12841
/*[clinic end generated code: output=3a65b1db356948dc input=288cfd6bc8828f5a]*/
12842
22.1M
{
12843
22.1M
    if (sep == Py_None)
12844
175k
        return split(self, NULL, maxsplit);
12845
21.9M
    if (PyUnicode_Check(sep))
12846
21.9M
        return split(self, sep, maxsplit);
12847
12848
0
    PyErr_Format(PyExc_TypeError,
12849
0
                 "must be str or None, not %.100s",
12850
0
                 Py_TYPE(sep)->tp_name);
12851
0
    return NULL;
12852
21.9M
}
12853
12854
PyObject *
12855
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12856
8.69M
{
12857
8.69M
    PyObject* out;
12858
8.69M
    int kind1, kind2;
12859
8.69M
    const void *buf1, *buf2;
12860
8.69M
    Py_ssize_t len1, len2;
12861
12862
8.69M
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12863
0
        return NULL;
12864
12865
8.69M
    kind1 = PyUnicode_KIND(str_obj);
12866
8.69M
    kind2 = PyUnicode_KIND(sep_obj);
12867
8.69M
    len1 = PyUnicode_GET_LENGTH(str_obj);
12868
8.69M
    len2 = PyUnicode_GET_LENGTH(sep_obj);
12869
8.69M
    if (kind1 < kind2 || len1 < len2) {
12870
1.41k
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
12871
1.41k
        return PyTuple_Pack(3, str_obj, empty, empty);
12872
1.41k
    }
12873
8.69M
    buf1 = PyUnicode_DATA(str_obj);
12874
8.69M
    buf2 = PyUnicode_DATA(sep_obj);
12875
8.69M
    if (kind2 != kind1) {
12876
86.1k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
12877
86.1k
        if (!buf2)
12878
0
            return NULL;
12879
86.1k
    }
12880
12881
8.69M
    switch (kind1) {
12882
8.60M
    case PyUnicode_1BYTE_KIND:
12883
8.60M
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12884
3.02M
            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12885
5.57M
        else
12886
5.57M
            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12887
8.60M
        break;
12888
73.8k
    case PyUnicode_2BYTE_KIND:
12889
73.8k
        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12890
73.8k
        break;
12891
12.2k
    case PyUnicode_4BYTE_KIND:
12892
12.2k
        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12893
12.2k
        break;
12894
0
    default:
12895
0
        Py_UNREACHABLE();
12896
8.69M
    }
12897
12898
8.69M
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
12899
8.69M
    if (kind2 != kind1)
12900
86.1k
        PyMem_Free((void *)buf2);
12901
12902
8.69M
    return out;
12903
8.69M
}
12904
12905
12906
PyObject *
12907
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12908
60.4k
{
12909
60.4k
    PyObject* out;
12910
60.4k
    int kind1, kind2;
12911
60.4k
    const void *buf1, *buf2;
12912
60.4k
    Py_ssize_t len1, len2;
12913
12914
60.4k
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12915
0
        return NULL;
12916
12917
60.4k
    kind1 = PyUnicode_KIND(str_obj);
12918
60.4k
    kind2 = PyUnicode_KIND(sep_obj);
12919
60.4k
    len1 = PyUnicode_GET_LENGTH(str_obj);
12920
60.4k
    len2 = PyUnicode_GET_LENGTH(sep_obj);
12921
60.4k
    if (kind1 < kind2 || len1 < len2) {
12922
0
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
12923
0
        return PyTuple_Pack(3, empty, empty, str_obj);
12924
0
    }
12925
60.4k
    buf1 = PyUnicode_DATA(str_obj);
12926
60.4k
    buf2 = PyUnicode_DATA(sep_obj);
12927
60.4k
    if (kind2 != kind1) {
12928
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
12929
0
        if (!buf2)
12930
0
            return NULL;
12931
0
    }
12932
12933
60.4k
    switch (kind1) {
12934
60.4k
    case PyUnicode_1BYTE_KIND:
12935
60.4k
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12936
60.4k
            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12937
0
        else
12938
0
            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12939
60.4k
        break;
12940
0
    case PyUnicode_2BYTE_KIND:
12941
0
        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12942
0
        break;
12943
0
    case PyUnicode_4BYTE_KIND:
12944
0
        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12945
0
        break;
12946
0
    default:
12947
0
        Py_UNREACHABLE();
12948
60.4k
    }
12949
12950
60.4k
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
12951
60.4k
    if (kind2 != kind1)
12952
0
        PyMem_Free((void *)buf2);
12953
12954
60.4k
    return out;
12955
60.4k
}
12956
12957
/*[clinic input]
12958
str.partition as unicode_partition
12959
12960
    sep: object
12961
    /
12962
12963
Partition the string into three parts using the given separator.
12964
12965
This will search for the separator in the string.  If the separator
12966
is found, returns a 3-tuple containing the part before the
12967
separator, the separator itself, and the part after it.
12968
12969
If the separator is not found, returns a 3-tuple containing
12970
the original string and two empty strings.
12971
[clinic start generated code]*/
12972
12973
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=e45faa8c26270cb1]*/
{
    /* str.partition(): all of the work is delegated to the public
       PyUnicode_Partition() entry point; its result (or NULL) is returned
       untouched. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
12979
12980
/*[clinic input]
12981
str.rpartition as unicode_rpartition = str.partition
12982
12983
Partition the string into three parts using the given separator.
12984
12985
This will search for the separator in the string, starting at the
12986
end.  If the separator is found, returns a 3-tuple containing the
12987
part before the separator, the separator itself, and the part after
12988
it.
12989
12990
If the separator is not found, returns a 3-tuple containing two
12991
empty strings and the original string.
12992
[clinic start generated code]*/
12993
12994
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=53a7f8cb19975b7c]*/
{
    /* str.rpartition(): all of the work is delegated to the public
       PyUnicode_RPartition() entry point; its result (or NULL) is returned
       untouched. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13000
13001
PyObject *
13002
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13003
0
{
13004
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13005
0
        return NULL;
13006
13007
0
    return rsplit(s, sep, maxsplit);
13008
0
}
13009
13010
/*[clinic input]
13011
@permit_long_summary
13012
str.rsplit as unicode_rsplit = str.split
13013
13014
Return a list of the substrings in the string, using sep as the separator string.
13015
13016
Splitting starts at the end of the string and works to the front.
13017
[clinic start generated code]*/
13018
13019
static PyObject *
13020
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13021
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13022
66
{
13023
66
    if (sep == Py_None)
13024
0
        return rsplit(self, NULL, maxsplit);
13025
66
    if (PyUnicode_Check(sep))
13026
66
        return rsplit(self, sep, maxsplit);
13027
13028
0
    PyErr_Format(PyExc_TypeError,
13029
0
                 "must be str or None, not %.100s",
13030
0
                 Py_TYPE(sep)->tp_name);
13031
0
    return NULL;
13032
66
}
13033
13034
/*[clinic input]
13035
@permit_long_summary
13036
str.splitlines as unicode_splitlines
13037
13038
    keepends: bool = False
13039
13040
Return a list of the lines in the string, breaking at line boundaries.
13041
13042
Line breaks are not included in the resulting list unless keepends
13043
is given and true.
13044
[clinic start generated code]*/
13045
13046
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=b45ea0f87645a06d]*/
{
    /* str.splitlines(): forwards both arguments unchanged to the public
       PyUnicode_Splitlines() entry point. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13052
13053
static
13054
PyObject *unicode_str(PyObject *self)
13055
2.90M
{
13056
2.90M
    return unicode_result_unchanged(self);
13057
2.90M
}
13058
13059
/*[clinic input]
13060
@permit_long_summary
13061
str.swapcase as unicode_swapcase
13062
13063
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13064
[clinic start generated code]*/
13065
13066
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* str.swapcase(): run the shared case_operation() driver with the
       do_swapcase callback. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13072
13073
static int
13074
unicode_maketrans_from_dict(PyObject *x, PyObject *newdict)
13075
0
{
13076
0
    PyObject *key, *value;
13077
0
    Py_ssize_t i = 0;
13078
0
    int res;
13079
0
    while (PyDict_Next(x, &i, &key, &value)) {
13080
0
        if (PyUnicode_Check(key)) {
13081
0
            PyObject *newkey;
13082
0
            int kind;
13083
0
            const void *data;
13084
0
            if (PyUnicode_GET_LENGTH(key) != 1) {
13085
0
                PyErr_SetString(PyExc_ValueError, "string keys in translate"
13086
0
                                "table must be of length 1");
13087
0
                return -1;
13088
0
            }
13089
0
            kind = PyUnicode_KIND(key);
13090
0
            data = PyUnicode_DATA(key);
13091
0
            newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13092
0
            if (!newkey)
13093
0
                return -1;
13094
0
            res = PyDict_SetItem(newdict, newkey, value);
13095
0
            Py_DECREF(newkey);
13096
0
            if (res < 0)
13097
0
                return -1;
13098
0
        }
13099
0
        else if (PyLong_Check(key)) {
13100
0
            if (PyDict_SetItem(newdict, key, value) < 0)
13101
0
                return -1;
13102
0
        }
13103
0
        else {
13104
0
            PyErr_SetString(PyExc_TypeError, "keys in translate table must"
13105
0
                            "be strings or integers");
13106
0
            return -1;
13107
0
        }
13108
0
    }
13109
0
    return 0;
13110
0
}
13111
13112
/*[clinic input]
13113
13114
@staticmethod
13115
str.maketrans as unicode_maketrans
13116
13117
  x: object
13118
13119
  y: unicode=NULL
13120
13121
  z: unicode=NULL
13122
13123
  /
13124
13125
Return a translation table usable for str.translate().
13126
13127
If there is only one argument, it must be a dictionary mapping
13128
Unicode ordinals (integers) or characters to Unicode ordinals,
13129
strings or None.  Character keys will be then converted to ordinals.
13130
If there are two arguments, they must be strings of equal length,
13131
and in the resulting dictionary, each character in x will be mapped
13132
to the character at the same position in y.  If there is a third
13133
argument, it must be a string, whose characters will be mapped to
13134
None in the result.
13135
[clinic start generated code]*/
13136
13137
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=66bc00a1b4258a6e]*/
{
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* One-argument form: x has to be a dict whose entries are copied
           into the new table, with str keys converted to int keys. */
        if (!PyAnyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto fail;
        }
        int status;
        Py_BEGIN_CRITICAL_SECTION(x);
        status = unicode_maketrans_from_dict(x, table);
        Py_END_CRITICAL_SECTION();
        if (status < 0) {
            goto fail;
        }
    }
    else {
        /* Two/three-argument form: x must be a str as long as y. */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto fail;
        }
        const Py_ssize_t x_len = PyUnicode_GET_LENGTH(x);
        if (x_len != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto fail;
        }

        /* Map the ordinal of x[pos] to the ordinal of y[pos]. */
        const int x_kind = PyUnicode_KIND(x);
        const int y_kind = PyUnicode_KIND(y);
        const void *x_data = PyUnicode_DATA(x);
        const void *y_data = PyUnicode_DATA(y);
        for (Py_ssize_t pos = 0; pos < x_len; pos++) {
            PyObject *from_ord = PyLong_FromLong(
                PyUnicode_READ(x_kind, x_data, pos));
            if (from_ord == NULL) {
                goto fail;
            }
            PyObject *to_ord = PyLong_FromLong(
                PyUnicode_READ(y_kind, y_data, pos));
            if (to_ord == NULL) {
                Py_DECREF(from_ord);
                goto fail;
            }
            int rc = PyDict_SetItem(table, from_ord, to_ord);
            Py_DECREF(from_ord);
            Py_DECREF(to_ord);
            if (rc < 0) {
                goto fail;
            }
        }

        /* Map the ordinal of every character of z to None. */
        if (z != NULL) {
            const int z_kind = PyUnicode_KIND(z);
            const void *z_data = PyUnicode_DATA(z);
            const Py_ssize_t z_len = PyUnicode_GET_LENGTH(z);
            for (Py_ssize_t pos = 0; pos < z_len; pos++) {
                PyObject *del_ord = PyLong_FromLong(
                    PyUnicode_READ(z_kind, z_data, pos));
                if (del_ord == NULL) {
                    goto fail;
                }
                int rc = PyDict_SetItem(table, del_ord, Py_None);
                Py_DECREF(del_ord);
                if (rc < 0) {
                    goto fail;
                }
            }
        }
    }
    return table;

  fail:
    /* release the partially filled table */
    Py_DECREF(table);
    return NULL;
}
13217
13218
/*[clinic input]
13219
@permit_long_summary
13220
str.translate as unicode_translate
13221
13222
    table: object
13223
        Translation table, which must be a mapping of Unicode ordinals
13224
        to Unicode ordinals, strings, or None.
13225
    /
13226
13227
Replace each character in the string using the given translation table.
13228
13229
The table must implement lookup/indexing via __getitem__, for
13230
instance a dictionary or list.  If this operation raises
13231
LookupError, the character is left untouched.  Characters mapped to
13232
None are deleted.
13233
[clinic start generated code]*/
13234
13235
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=48cf0efe06bc1b75]*/
{
    /* str.translate(): delegate to the charmap translator, always passing
       "ignore" as the error-handler name. */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13241
13242
/*[clinic input]
13243
str.upper as unicode_upper
13244
13245
Return a copy of the string converted to uppercase.
13246
[clinic start generated code]*/
13247
13248
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* Non-ASCII strings go through the generic case_operation() driver;
       ASCII strings use ascii_upper_or_lower() with a second argument
       of 0. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    return ascii_upper_or_lower(self, 0);
}
13256
13257
/*[clinic input]
13258
@permit_long_summary
13259
str.zfill as unicode_zfill
13260
13261
    width: Py_ssize_t
13262
    /
13263
13264
Pad a numeric string with zeros on the left, to fill a field of the given width.
13265
13266
The string is never truncated.
13267
[clinic start generated code]*/
13268
13269
static PyObject *
13270
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13271
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13272
0
{
13273
0
    Py_ssize_t fill;
13274
0
    PyObject *u;
13275
0
    int kind;
13276
0
    const void *data;
13277
0
    Py_UCS4 chr;
13278
13279
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13280
0
        return unicode_result_unchanged(self);
13281
13282
0
    fill = width - PyUnicode_GET_LENGTH(self);
13283
13284
0
    u = pad(self, fill, 0, '0');
13285
13286
0
    if (u == NULL)
13287
0
        return NULL;
13288
13289
0
    kind = PyUnicode_KIND(u);
13290
0
    data = PyUnicode_DATA(u);
13291
0
    chr = PyUnicode_READ(kind, data, fill);
13292
13293
0
    if (chr == '+' || chr == '-') {
13294
        /* move sign to beginning of string */
13295
0
        PyUnicode_WRITE(kind, data, 0, chr);
13296
0
        PyUnicode_WRITE(kind, data, fill, '0');
13297
0
    }
13298
13299
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13300
0
    return u;
13301
0
}
13302
13303
/*[clinic input]
13304
@permit_long_summary
13305
@text_signature "($self, prefix[, start[, end]], /)"
13306
str.startswith as unicode_startswith
13307
13308
    prefix as subobj: object
13309
        A string or a tuple of strings to try.
13310
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13311
        Optional start position. Default: start of the string.
13312
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13313
        Optional stop position. Default: end of the string.
13314
    /
13315
13316
Return True if the string starts with the specified prefix, False otherwise.
13317
[clinic start generated code]*/
13318
13319
static PyObject *
13320
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13321
                        Py_ssize_t end)
13322
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13323
43.7M
{
13324
43.7M
    if (PyTuple_Check(subobj)) {
13325
1.74M
        Py_ssize_t i;
13326
6.40M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13327
4.67M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13328
4.67M
            if (!PyUnicode_Check(substring)) {
13329
0
                PyErr_Format(PyExc_TypeError,
13330
0
                             "tuple for startswith must only contain str, "
13331
0
                             "not %.100s",
13332
0
                             Py_TYPE(substring)->tp_name);
13333
0
                return NULL;
13334
0
            }
13335
4.67M
            int result = tailmatch(self, substring, start, end, -1);
13336
4.67M
            if (result < 0) {
13337
0
                return NULL;
13338
0
            }
13339
4.67M
            if (result) {
13340
18.5k
                Py_RETURN_TRUE;
13341
18.5k
            }
13342
4.67M
        }
13343
        /* nothing matched */
13344
1.74M
        Py_RETURN_FALSE;
13345
1.74M
    }
13346
42.0M
    if (!PyUnicode_Check(subobj)) {
13347
0
        PyErr_Format(PyExc_TypeError,
13348
0
                     "startswith first arg must be str or "
13349
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13350
0
        return NULL;
13351
0
    }
13352
42.0M
    int result = tailmatch(self, subobj, start, end, -1);
13353
42.0M
    if (result < 0) {
13354
0
        return NULL;
13355
0
    }
13356
42.0M
    return PyBool_FromLong(result);
13357
42.0M
}
13358
13359
13360
/*[clinic input]
13361
@permit_long_summary
13362
@text_signature "($self, suffix[, start[, end]], /)"
13363
str.endswith as unicode_endswith
13364
13365
    suffix as subobj: object
13366
        A string or a tuple of strings to try.
13367
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13368
        Optional start position. Default: start of the string.
13369
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13370
        Optional stop position. Default: end of the string.
13371
    /
13372
13373
Return True if the string ends with the specified suffix, False otherwise.
13374
[clinic start generated code]*/
13375
13376
static PyObject *
13377
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13378
                      Py_ssize_t end)
13379
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13380
10.9M
{
13381
10.9M
    if (PyTuple_Check(subobj)) {
13382
167k
        Py_ssize_t i;
13383
311k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13384
287k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13385
287k
            if (!PyUnicode_Check(substring)) {
13386
0
                PyErr_Format(PyExc_TypeError,
13387
0
                             "tuple for endswith must only contain str, "
13388
0
                             "not %.100s",
13389
0
                             Py_TYPE(substring)->tp_name);
13390
0
                return NULL;
13391
0
            }
13392
287k
            int result = tailmatch(self, substring, start, end, +1);
13393
287k
            if (result < 0) {
13394
0
                return NULL;
13395
0
            }
13396
287k
            if (result) {
13397
143k
                Py_RETURN_TRUE;
13398
143k
            }
13399
287k
        }
13400
167k
        Py_RETURN_FALSE;
13401
167k
    }
13402
10.7M
    if (!PyUnicode_Check(subobj)) {
13403
0
        PyErr_Format(PyExc_TypeError,
13404
0
                     "endswith first arg must be str or "
13405
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13406
0
        return NULL;
13407
0
    }
13408
10.7M
    int result = tailmatch(self, subobj, start, end, +1);
13409
10.7M
    if (result < 0) {
13410
0
        return NULL;
13411
0
    }
13412
10.7M
    return PyBool_FromLong(result);
13413
10.7M
}
13414
13415
13416
#include "stringlib/unicode_format.h"
13417
13418
PyDoc_STRVAR(format__doc__,
13419
             "format($self, /, *args, **kwargs)\n\
13420
--\n\
13421
\n\
13422
Return a formatted version of the string, using substitutions from args and kwargs.\n\
13423
The substitutions are identified by braces ('{' and '}').");
13424
13425
PyDoc_STRVAR(format_map__doc__,
13426
             "format_map($self, mapping, /)\n\
13427
--\n\
13428
\n\
13429
Return a formatted version of the string, using substitutions from mapping.\n\
13430
The substitutions are identified by braces ('{' and '}').");
13431
13432
/*[clinic input]
13433
@permit_long_summary
13434
str.__format__ as unicode___format__
13435
13436
    format_spec: unicode
13437
    /
13438
13439
Return a formatted version of the string as described by format_spec.
13440
[clinic start generated code]*/
13441
13442
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=77a2a19f3f7969f2]*/
{
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    /* Format `self` using the whole of format_spec, i.e. the range
       [0, len(format_spec)). */
    const Py_ssize_t spec_end = PyUnicode_GET_LENGTH(format_spec);
    if (_PyUnicode_FormatAdvancedWriter(&writer, self, format_spec,
                                        0, spec_end) == -1)
    {
        /* release whatever the writer accumulated before failing */
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }
    return _PyUnicodeWriter_Finish(&writer);
}
13459
13460
/*[clinic input]
13461
str.__sizeof__ as unicode_sizeof
13462
13463
Return the size of the string in memory, in bytes.
13464
[clinic start generated code]*/
13465
13466
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    Py_ssize_t nbytes;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* compact ASCII: header plus one byte per character plus one
           extra byte */
        nbytes = sizeof(PyASCIIObject) + length + 1;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* compact non-ASCII: header plus (length + 1) slots of
           PyUnicode_KIND(self) bytes each */
        nbytes = sizeof(PyCompactUnicodeObject) +
            (length + 1) * PyUnicode_KIND(self);
    }
    else {
        /* two-block object: base object, plus the separate character
           block when one is present */
        nbytes = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            nbytes += (length + 1) *
                PyUnicode_KIND(self);
        }
    }

    /* count the UTF-8 buffer too when _PyUnicode_HAS_UTF8_MEMORY says the
       object has one */
    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        nbytes += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(nbytes);
}
13494
13495
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    /* __getnewargs__: build a 1-tuple around a copy of `v`.  The "N"
       format unit hands our reference to the copy over to the tuple. */
    PyObject *copy = _PyUnicode_Copy(v);
    if (copy == NULL) {
        return NULL;
    }
    PyObject *args = Py_BuildValue("(N)", copy);
    return args;
}
13503
13504
/*
13505
This function searches for the longest common leading whitespace
13506
of all lines in the [src, end).
13507
It returns the length of the common leading whitespace and sets `output` to
13508
point to the beginning of the common leading whitespace if length > 0.
13509
*/
13510
static Py_ssize_t
13511
search_longest_common_leading_whitespace(
13512
    const char *const src,
13513
    const char *const end,
13514
    const char **output)
13515
0
{
13516
    // [_start, _start + _len)
13517
    // describes the current longest common leading whitespace
13518
0
    const char *_start = NULL;
13519
0
    Py_ssize_t _len = 0;
13520
13521
0
    for (const char *iter = src; iter < end; ++iter) {
13522
0
        const char *line_start = iter;
13523
0
        const char *leading_whitespace_end = NULL;
13524
13525
        // scan the whole line
13526
0
        while (iter < end && *iter != '\n') {
13527
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
13528
                /* `iter` points to the first non-whitespace character
13529
                   in this line */
13530
0
                if (iter == line_start) {
13531
                    // some line has no indent, fast exit!
13532
0
                    return 0;
13533
0
                }
13534
0
                leading_whitespace_end = iter;
13535
0
            }
13536
0
            ++iter;
13537
0
        }
13538
13539
        // if this line has all white space, skip it
13540
0
        if (!leading_whitespace_end) {
13541
0
            continue;
13542
0
        }
13543
13544
0
        if (!_start) {
13545
            // update the first leading whitespace
13546
0
            _start = line_start;
13547
0
            _len = leading_whitespace_end - line_start;
13548
0
            assert(_len > 0);
13549
0
        }
13550
0
        else {
13551
            /* We then compare with the current longest leading whitespace.
13552
13553
               [line_start, leading_whitespace_end) is the leading
13554
               whitespace of this line,
13555
13556
               [_start, _start + _len) is the leading whitespace of the
13557
               current longest leading whitespace. */
13558
0
            Py_ssize_t new_len = 0;
13559
0
            const char *_iter = _start, *line_iter = line_start;
13560
13561
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
13562
0
                   && *_iter == *line_iter)
13563
0
            {
13564
0
                ++_iter;
13565
0
                ++line_iter;
13566
0
                ++new_len;
13567
0
            }
13568
13569
0
            _len = new_len;
13570
0
            if (_len == 0) {
13571
                // No common things now, fast exit!
13572
0
                return 0;
13573
0
            }
13574
0
        }
13575
0
    }
13576
13577
0
    assert(_len >= 0);
13578
0
    if (_len > 0) {
13579
0
        *output = _start;
13580
0
    }
13581
0
    return _len;
13582
0
}
13583
13584
/* Dedent a string.
13585
   Intended to dedent Python source. Unlike `textwrap.dedent`, this
13586
   only supports spaces and tabs and doesn't normalize empty lines.
13587
   Return a new reference on success, NULL with exception set on error.
13588
   */
13589
PyObject *
_PyUnicode_Dedent(PyObject *unicode)
{
    Py_ssize_t src_len = 0;
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
    if (!src) {
        return NULL;
    }
    assert(src_len >= 0);
    if (src_len == 0) {
        return Py_NewRef(unicode);
    }

    const char *const end = src + src_len;

    // [whitespace_start, whitespace_start + whitespace_len)
    // describes the current longest common leading whitespace
    const char *whitespace_start = NULL;
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
        src, end, &whitespace_start);

    if (whitespace_len == 0) {
        // nothing to strip: hand back the input itself
        return Py_NewRef(unicode);
    }

    // now we should trigger a dedent
    // Every line is copied minus a prefix, so the output never exceeds
    // src_len bytes.
    char *dest = PyMem_Malloc(src_len);
    if (!dest) {
        PyErr_NoMemory();
        return NULL;
    }
    char *dest_iter = dest;

    for (const char *iter = src; iter < end; ++iter) {
        const char *line_start = iter;
        bool in_leading_space = true;

        // iterate over a line to find the end of a line
        while (iter < end && *iter != '\n') {
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
                in_leading_space = false;
            }
            ++iter;
        }

        // invariant: *iter == '\n' or iter == end
        bool append_newline = iter < end;

        // if this line has all white space, write '\n' and continue
        if (in_leading_space && append_newline) {
            *dest_iter++ = '\n';
            continue;
        }

        /* copy [line_start + whitespace_len, iter) to buffer, then
            conditionally append '\n' */

        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
        if (new_line_len > 0) {
            memcpy(dest_iter, line_start + whitespace_len, new_line_len);
            dest_iter += new_line_len;
        }
        else {
            /* Only a whitespace-only final line without a trailing '\n'
               can be no longer than the common indent, because such lines
               are ignored when the indent is computed.  There is nothing
               left to copy; a negative length must never reach memcpy(). */
            assert(in_leading_space && !append_newline);
        }

        if (append_newline) {
            *dest_iter++ = '\n';
        }
    }

    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
    PyMem_Free(dest);
    return res;
}
13661
13662
static PyMethodDef unicode_methods[] = {
13663
    UNICODE_ENCODE_METHODDEF
13664
    UNICODE_REPLACE_METHODDEF
13665
    UNICODE_SPLIT_METHODDEF
13666
    UNICODE_RSPLIT_METHODDEF
13667
    UNICODE_JOIN_METHODDEF
13668
    UNICODE_CAPITALIZE_METHODDEF
13669
    UNICODE_CASEFOLD_METHODDEF
13670
    UNICODE_TITLE_METHODDEF
13671
    UNICODE_CENTER_METHODDEF
13672
    UNICODE_COUNT_METHODDEF
13673
    UNICODE_EXPANDTABS_METHODDEF
13674
    UNICODE_FIND_METHODDEF
13675
    UNICODE_PARTITION_METHODDEF
13676
    UNICODE_INDEX_METHODDEF
13677
    UNICODE_LJUST_METHODDEF
13678
    UNICODE_LOWER_METHODDEF
13679
    UNICODE_LSTRIP_METHODDEF
13680
    UNICODE_RFIND_METHODDEF
13681
    UNICODE_RINDEX_METHODDEF
13682
    UNICODE_RJUST_METHODDEF
13683
    UNICODE_RSTRIP_METHODDEF
13684
    UNICODE_RPARTITION_METHODDEF
13685
    UNICODE_SPLITLINES_METHODDEF
13686
    UNICODE_STRIP_METHODDEF
13687
    UNICODE_SWAPCASE_METHODDEF
13688
    UNICODE_TRANSLATE_METHODDEF
13689
    UNICODE_UPPER_METHODDEF
13690
    UNICODE_STARTSWITH_METHODDEF
13691
    UNICODE_ENDSWITH_METHODDEF
13692
    UNICODE_REMOVEPREFIX_METHODDEF
13693
    UNICODE_REMOVESUFFIX_METHODDEF
13694
    UNICODE_ISASCII_METHODDEF
13695
    UNICODE_ISLOWER_METHODDEF
13696
    UNICODE_ISUPPER_METHODDEF
13697
    UNICODE_ISTITLE_METHODDEF
13698
    UNICODE_ISSPACE_METHODDEF
13699
    UNICODE_ISDECIMAL_METHODDEF
13700
    UNICODE_ISDIGIT_METHODDEF
13701
    UNICODE_ISNUMERIC_METHODDEF
13702
    UNICODE_ISALPHA_METHODDEF
13703
    UNICODE_ISALNUM_METHODDEF
13704
    UNICODE_ISIDENTIFIER_METHODDEF
13705
    UNICODE_ISPRINTABLE_METHODDEF
13706
    UNICODE_ZFILL_METHODDEF
13707
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13708
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
13709
    UNICODE___FORMAT___METHODDEF
13710
    UNICODE_MAKETRANS_METHODDEF
13711
    UNICODE_SIZEOF_METHODDEF
13712
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13713
    {NULL, NULL}
13714
};
13715
13716
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    /* nb_remainder slot: `v % w` is %-formatting when the left operand is
       a str; otherwise NotImplemented is returned. */
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13723
13724
static PyNumberMethods unicode_as_number = {
13725
    0,              /*nb_add*/
13726
    0,              /*nb_subtract*/
13727
    0,              /*nb_multiply*/
13728
    unicode_mod,            /*nb_remainder*/
13729
};
13730
13731
static PySequenceMethods unicode_as_sequence = {
13732
    unicode_length,     /* sq_length */
13733
    PyUnicode_Concat,   /* sq_concat */
13734
    _PyUnicode_Repeat,  /* sq_repeat */
13735
    unicode_getitem,    /* sq_item */
13736
    0,                  /* sq_slice */
13737
    0,                  /* sq_ass_item */
13738
    0,                  /* sq_ass_slice */
13739
    PyUnicode_Contains, /* sq_contains */
13740
};
13741
13742
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    /* Integer-like subscript: a single character. */
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            /* negative indices count from the end */
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    /* Slice subscript. */
    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t self_len = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t slicelength = PySlice_AdjustIndices(self_len,
                                                         &start, &stop, step);

    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        /* contiguous slice: either the whole string or a plain substring */
        if (start == 0 && slicelength == self_len) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self,
                                   start, start + slicelength);
    }

    /* General case: step != 1, pick the characters one by one. */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    size_t pos;
    Py_ssize_t n;

    /* First pass: largest code point among the selected characters, so
       the result can be created with a matching maxchar. */
    Py_UCS4 max_char = 127;
    if (!PyUnicode_IS_ASCII(self)) {
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        max_char = 0;
        for (pos = start, n = 0; n < slicelength; pos += step, n++) {
            Py_UCS4 seen = PyUnicode_READ(src_kind, src_data, pos);
            if (seen > max_char) {
                max_char = seen;
                if (max_char >= kind_limit) {
                    /* limit for this kind reached: stop scanning */
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    /* Second pass: copy the selected characters. */
    for (pos = start, n = 0; n < slicelength; pos += step, n++) {
        Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
13811
13812
static PyMappingMethods unicode_as_mapping = {
13813
    unicode_length,     /* mp_length */
13814
    unicode_subscript,  /* mp_subscript */
13815
    0,                  /* mp_ass_subscript */
13816
};
13817
13818
13819
static PyObject *
13820
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
13821
13822
/*[clinic input]
13823
@classmethod
13824
str.__new__ as unicode_new
13825
13826
    object as x: object = NULL
13827
    encoding: str = NULL
13828
    errors: str = NULL
13829
13830
[clinic start generated code]*/
13831
13832
static PyObject *
13833
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
13834
                 const char *errors)
13835
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
13836
9.81M
{
13837
9.81M
    PyObject *unicode;
13838
9.81M
    if (x == NULL) {
13839
0
        unicode = _PyUnicode_GetEmpty();
13840
0
    }
13841
9.81M
    else if (encoding == NULL && errors == NULL) {
13842
9.81M
        unicode = PyObject_Str(x);
13843
9.81M
    }
13844
0
    else {
13845
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
13846
0
    }
13847
13848
9.81M
    if (unicode != NULL && type != &PyUnicode_Type) {
13849
9.81M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
13850
9.81M
    }
13851
9.81M
    return unicode;
13852
9.81M
}
13853
13854
static const char *
13855
arg_as_utf8(PyObject *obj, const char *name)
13856
2.86M
{
13857
2.86M
    if (!PyUnicode_Check(obj)) {
13858
0
        PyErr_Format(PyExc_TypeError,
13859
0
                     "str() argument '%s' must be str, not %T",
13860
0
                     name, obj);
13861
0
        return NULL;
13862
0
    }
13863
2.86M
    return _PyUnicode_AsUTF8NoNUL(obj);
13864
2.86M
}
13865
13866
/* Vectorcall entry point for calling the exact str type.
 *
 * Calls with keyword arguments are repacked into a tuple and a dict and
 * forwarded to unicode_new().  Positional-only calls accept 0 to 3
 * arguments: (), (object), (object, encoding) and
 * (object, encoding, errors).
 */
static PyObject *
unicode_vectorcall(PyObject *type, PyObject *const *args,
                   size_t nargsf, PyObject *kwnames)
{
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));

    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
        // Fallback to unicode_new()
        PyObject *posargs = PyTuple_FromArray(args, nargs);
        if (posargs == NULL) {
            return NULL;
        }
        PyObject *result = NULL;
        PyObject *kwargs = _PyStack_AsDict(args + nargs, kwnames);
        if (kwargs != NULL) {
            result = unicode_new(_PyType_CAST(type), posargs, kwargs);
        }
        Py_DECREF(posargs);
        Py_XDECREF(kwargs);
        return result;
    }

    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
        return NULL;
    }

    switch (nargs) {
    case 0:
        return _PyUnicode_GetEmpty();
    case 1:
        return PyObject_Str(args[0]);
    default:
        break;
    }

    /* Two or three arguments: decode args[0]. */
    const char *encoding = arg_as_utf8(args[1], "encoding");
    if (encoding == NULL) {
        return NULL;
    }
    const char *errors = NULL;
    if (nargs == 3) {
        errors = arg_as_utf8(args[2], "errors");
        if (errors == NULL) {
            return NULL;
        }
    }
    return PyUnicode_FromEncodedObject(args[0], encoding, errors);
}
13912
13913
static PyObject *
13914
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
13915
9.81M
{
13916
9.81M
    PyObject *self;
13917
9.81M
    Py_ssize_t length, char_size;
13918
9.81M
    int share_utf8;
13919
9.81M
    int kind;
13920
9.81M
    void *data;
13921
13922
9.81M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13923
9.81M
    assert(_PyUnicode_CHECK(unicode));
13924
13925
9.81M
    self = type->tp_alloc(type, 0);
13926
9.81M
    if (self == NULL) {
13927
0
        return NULL;
13928
0
    }
13929
9.81M
    kind = PyUnicode_KIND(unicode);
13930
9.81M
    length = PyUnicode_GET_LENGTH(unicode);
13931
13932
9.81M
    _PyUnicode_LENGTH(self) = length;
13933
#ifdef Py_DEBUG
13934
    _PyUnicode_HASH(self) = -1;
13935
#else
13936
9.81M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13937
9.81M
#endif
13938
9.81M
    _PyUnicode_STATE(self).interned = 0;
13939
9.81M
    _PyUnicode_STATE(self).kind = kind;
13940
9.81M
    _PyUnicode_STATE(self).compact = 0;
13941
9.81M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13942
9.81M
    _PyUnicode_STATE(self).statically_allocated = 0;
13943
9.81M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
13944
9.81M
    PyUnicode_SET_UTF8(self, NULL);
13945
9.81M
    _PyUnicode_DATA_ANY(self) = NULL;
13946
13947
9.81M
    share_utf8 = 0;
13948
9.81M
    if (kind == PyUnicode_1BYTE_KIND) {
13949
8.69M
        char_size = 1;
13950
8.69M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13951
8.66M
            share_utf8 = 1;
13952
8.69M
    }
13953
1.11M
    else if (kind == PyUnicode_2BYTE_KIND) {
13954
1.06M
        char_size = 2;
13955
1.06M
    }
13956
48.7k
    else {
13957
48.7k
        assert(kind == PyUnicode_4BYTE_KIND);
13958
48.7k
        char_size = 4;
13959
48.7k
    }
13960
13961
    /* Ensure we won't overflow the length. */
13962
9.81M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13963
0
        PyErr_NoMemory();
13964
0
        goto onError;
13965
0
    }
13966
9.81M
    data = PyMem_Malloc((length + 1) * char_size);
13967
9.81M
    if (data == NULL) {
13968
0
        PyErr_NoMemory();
13969
0
        goto onError;
13970
0
    }
13971
13972
9.81M
    _PyUnicode_DATA_ANY(self) = data;
13973
9.81M
    if (share_utf8) {
13974
8.66M
        PyUnicode_SET_UTF8_LENGTH(self, length);
13975
8.66M
        PyUnicode_SET_UTF8(self, data);
13976
8.66M
    }
13977
13978
9.81M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
13979
9.81M
    assert(_PyUnicode_CheckConsistency(self, 1));
13980
#ifdef Py_DEBUG
13981
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13982
#endif
13983
9.81M
    return self;
13984
13985
0
onError:
13986
0
    Py_DECREF(self);
13987
0
    return NULL;
13988
9.81M
}
13989
13990
static _PyObjectIndexPair
13991
unicode_iteritem(PyObject *obj, Py_ssize_t index)
13992
45.2M
{
13993
45.2M
    if (index >= PyUnicode_GET_LENGTH(obj)) {
13994
3.94M
        return (_PyObjectIndexPair) { .object = NULL, .index = index };
13995
3.94M
    }
13996
41.3M
    const void *data = PyUnicode_DATA(obj);
13997
41.3M
    int kind = PyUnicode_KIND(obj);
13998
41.3M
    Py_UCS4 ch = PyUnicode_READ(kind, data, index);
13999
41.3M
    PyObject *result = unicode_char(ch);
14000
41.3M
    index = (result == NULL) ? -1 : index + 1;
14001
41.3M
    return (_PyObjectIndexPair) { .object = result, .index = index };
14002
45.2M
}
14003
14004
void
14005
_PyUnicode_ExactDealloc(PyObject *op)
14006
74.9M
{
14007
74.9M
    assert(PyUnicode_CheckExact(op));
14008
74.9M
    unicode_dealloc(op);
14009
74.9M
}
14010
14011
PyDoc_STRVAR(unicode_doc,
14012
"str(object='') -> str\n\
14013
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14014
\n\
14015
Create a new string object from the given object. If encoding or\n\
14016
errors is specified, then the object must expose a data buffer\n\
14017
that will be decoded using the given encoding and error handler.\n\
14018
Otherwise, returns the result of object.__str__() (if defined)\n\
14019
or repr(object).\n\
14020
encoding defaults to 'utf-8'.\n\
14021
errors defaults to 'strict'.");
14022
14023
static PyObject *unicode_iter(PyObject *seq);
14024
14025
PyTypeObject PyUnicode_Type = {
14026
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14027
    "str",                        /* tp_name */
14028
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14029
    0,                            /* tp_itemsize */
14030
    /* Slots */
14031
    unicode_dealloc,              /* tp_dealloc */
14032
    0,                            /* tp_vectorcall_offset */
14033
    0,                            /* tp_getattr */
14034
    0,                            /* tp_setattr */
14035
    0,                            /* tp_as_async */
14036
    unicode_repr,                 /* tp_repr */
14037
    &unicode_as_number,           /* tp_as_number */
14038
    &unicode_as_sequence,         /* tp_as_sequence */
14039
    &unicode_as_mapping,          /* tp_as_mapping */
14040
    unicode_hash,                 /* tp_hash*/
14041
    0,                            /* tp_call*/
14042
    unicode_str,                  /* tp_str */
14043
    PyObject_GenericGetAttr,      /* tp_getattro */
14044
    0,                            /* tp_setattro */
14045
    0,                            /* tp_as_buffer */
14046
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14047
        Py_TPFLAGS_UNICODE_SUBCLASS |
14048
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14049
    unicode_doc,                  /* tp_doc */
14050
    0,                            /* tp_traverse */
14051
    0,                            /* tp_clear */
14052
    PyUnicode_RichCompare,        /* tp_richcompare */
14053
    0,                            /* tp_weaklistoffset */
14054
    unicode_iter,                 /* tp_iter */
14055
    0,                            /* tp_iternext */
14056
    unicode_methods,              /* tp_methods */
14057
    0,                            /* tp_members */
14058
    0,                            /* tp_getset */
14059
    0,                            /* tp_base */
14060
    0,                            /* tp_dict */
14061
    0,                            /* tp_descr_get */
14062
    0,                            /* tp_descr_set */
14063
    0,                            /* tp_dictoffset */
14064
    0,                            /* tp_init */
14065
    0,                            /* tp_alloc */
14066
    unicode_new,                  /* tp_new */
14067
    PyObject_Free,                /* tp_free */
14068
    .tp_vectorcall = unicode_vectorcall,
14069
    ._tp_iteritem = unicode_iteritem,
14070
};
14071
14072
/* Initialize the Unicode implementation */
14073
14074
static void
14075
_init_global_state(void)
14076
36
{
14077
36
    static int initialized = 0;
14078
36
    if (initialized) {
14079
0
        return;
14080
0
    }
14081
36
    initialized = 1;
14082
14083
    /* initialize the linebreak bloom filter */
14084
36
    const Py_UCS2 linebreak[] = {
14085
36
        0x000A, /* LINE FEED */
14086
36
        0x000D, /* CARRIAGE RETURN */
14087
36
        0x001C, /* FILE SEPARATOR */
14088
36
        0x001D, /* GROUP SEPARATOR */
14089
36
        0x001E, /* RECORD SEPARATOR */
14090
36
        0x0085, /* NEXT LINE */
14091
36
        0x2028, /* LINE SEPARATOR */
14092
36
        0x2029, /* PARAGRAPH SEPARATOR */
14093
36
    };
14094
36
    bloom_linebreak = make_bloom_mask(
14095
36
        PyUnicode_2BYTE_KIND, linebreak,
14096
36
        Py_ARRAY_LENGTH(linebreak));
14097
36
}
14098
14099
void
14100
_PyUnicode_InitState(PyInterpreterState *interp)
14101
36
{
14102
36
    if (!_Py_IsMainInterpreter(interp)) {
14103
0
        return;
14104
0
    }
14105
36
    _init_global_state();
14106
36
}
14107
14108
14109
PyStatus
14110
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14111
36
{
14112
36
    if (_Py_IsMainInterpreter(interp)) {
14113
36
        PyStatus status = init_global_interned_strings(interp);
14114
36
        if (_PyStatus_EXCEPTION(status)) {
14115
0
            return status;
14116
0
        }
14117
36
    }
14118
36
    assert(INTERNED_STRINGS);
14119
14120
36
    if (init_interned_dict(interp)) {
14121
0
        PyErr_Clear();
14122
0
        return _PyStatus_ERR("failed to create interned dict");
14123
0
    }
14124
14125
36
    return _PyStatus_OK();
14126
36
}
14127
14128
14129
PyStatus
14130
_PyUnicode_InitTypes(PyInterpreterState *interp)
14131
36
{
14132
36
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14133
0
        goto error;
14134
0
    }
14135
36
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14136
0
        goto error;
14137
0
    }
14138
36
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14139
0
        goto error;
14140
0
    }
14141
36
    return _PyStatus_OK();
14142
14143
0
error:
14144
0
    return _PyStatus_ERR("Can't initialize unicode types");
14145
36
}
14146
14147
static /* non-null */ PyObject*
14148
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14149
40.3k
{
14150
    // Note that this steals a reference to `s`, but in many cases that
14151
    // stolen ref is returned, requiring no decref/incref.
14152
14153
40.3k
    assert(s != NULL);
14154
40.3k
    assert(_PyUnicode_CHECK(s));
14155
40.3k
    assert(_PyUnicode_STATE(s).statically_allocated);
14156
40.3k
    assert(!PyUnicode_CHECK_INTERNED(s));
14157
14158
#ifdef Py_DEBUG
14159
    /* We must not add process-global interned string if there's already a
14160
     * per-interpreter interned_dict, which might contain duplicates.
14161
     */
14162
    PyObject *interned = get_interned_dict(interp);
14163
    assert(interned == NULL);
14164
#endif
14165
14166
    /* Look in the global cache first. */
14167
40.3k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14168
    /* We should only init each string once */
14169
40.3k
    assert(r == NULL);
14170
    /* but just in case (for the non-debug build), handle this */
14171
40.3k
    if (r != NULL && r != s) {
14172
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14173
0
        assert(_PyUnicode_CHECK(r));
14174
0
        Py_DECREF(s);
14175
0
        return Py_NewRef(r);
14176
0
    }
14177
14178
40.3k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14179
0
        Py_FatalError("failed to intern static string");
14180
0
    }
14181
14182
40.3k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14183
40.3k
    return s;
14184
40.3k
}
14185
14186
void
14187
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14188
40.3k
{
14189
    // This should only be called as part of runtime initialization
14190
40.3k
    assert(!Py_IsInitialized());
14191
14192
40.3k
    *p = intern_static(interp, *p);
14193
40.3k
    assert(*p);
14194
40.3k
}
14195
14196
static void
14197
immortalize_interned(PyObject *s)
14198
281k
{
14199
281k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14200
281k
    assert(!_Py_IsImmortal(s));
14201
#ifdef Py_REF_DEBUG
14202
    /* The reference count value should be excluded from the RefTotal.
14203
       The decrements to these objects will not be registered so they
14204
       need to be accounted for in here. */
14205
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14206
        _Py_DecRefTotal(_PyThreadState_GET());
14207
    }
14208
#endif
14209
281k
    _Py_SetImmortal(s);
14210
    // The switch to SSTATE_INTERNED_IMMORTAL must be the last thing done here
14211
    // to synchronize with the check in intern_common() that avoids locking if
14212
    // the string is already immortal.
14213
281k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14214
281k
}
14215
14216
#ifdef Py_GIL_DISABLED
14217
static bool
14218
can_immortalize_safely(PyObject *s)
14219
{
14220
    if (_Py_IsOwnedByCurrentThread(s) || _Py_IsImmortal(s)) {
14221
        return true;
14222
    }
14223
    Py_ssize_t shared = _Py_atomic_load_ssize(&s->ob_ref_shared);
14224
    return _Py_REF_IS_MERGED(shared);
14225
}
14226
#endif
14227
14228
static /* non-null */ PyObject*
14229
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14230
              bool immortalize)
14231
66.6M
{
14232
    // Note that this steals a reference to `s`, but in many cases that
14233
    // stolen ref is returned, requiring no decref/incref.
14234
14235
#ifdef Py_DEBUG
14236
    assert(s != NULL);
14237
    assert(_PyUnicode_CHECK(s));
14238
#else
14239
66.6M
    if (s == NULL || !PyUnicode_Check(s)) {
14240
0
        return s;
14241
0
    }
14242
66.6M
#endif
14243
14244
    /* If it's a subclass, we don't really know what putting
14245
       it in the interned dict might do. */
14246
66.6M
    if (!PyUnicode_CheckExact(s)) {
14247
0
        return s;
14248
0
    }
14249
14250
    /* Is it already interned? */
14251
66.6M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14252
5.93M
        case SSTATE_NOT_INTERNED:
14253
            // no, go on
14254
5.93M
            break;
14255
28.7k
        case SSTATE_INTERNED_MORTAL:
14256
28.7k
#ifndef Py_GIL_DISABLED
14257
            // yes but we might need to make it immortal
14258
28.7k
            if (immortalize) {
14259
1.79k
                immortalize_interned(s);
14260
1.79k
            }
14261
28.7k
            return s;
14262
#else
14263
            // not fully interned yet; fall through to the locking path
14264
            break;
14265
#endif
14266
60.6M
        default:
14267
            // all done
14268
60.6M
            return s;
14269
66.6M
    }
14270
14271
    /* Statically allocated strings must be already interned. */
14272
66.6M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14273
14274
#if Py_GIL_DISABLED
14275
    /* In the free-threaded build, all interned strings are immortal */
14276
    immortalize = 1;
14277
#endif
14278
14279
    /* If it's already immortal, intern it as such */
14280
5.93M
    if (_Py_IsImmortal(s)) {
14281
0
        immortalize = 1;
14282
0
    }
14283
14284
    /* if it's a short string, get the singleton */
14285
5.93M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14286
16.4k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14287
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14288
0
        assert(PyUnicode_CHECK_INTERNED(r));
14289
0
        Py_DECREF(s);
14290
0
        return r;
14291
0
    }
14292
#ifdef Py_DEBUG
14293
    assert(!unicode_is_singleton(s));
14294
#endif
14295
14296
    /* Look in the global cache now. */
14297
5.93M
    {
14298
5.93M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14299
5.93M
        if (r != NULL) {
14300
514k
            assert(_PyUnicode_STATE(r).statically_allocated);
14301
514k
            assert(r != s);  // r must be statically_allocated; s is not
14302
514k
            Py_DECREF(s);
14303
514k
            return Py_NewRef(r);
14304
514k
        }
14305
5.93M
    }
14306
14307
    /* Do a setdefault on the per-interpreter cache. */
14308
5.41M
    PyObject *interned = get_interned_dict(interp);
14309
5.41M
    assert(interned != NULL);
14310
#ifdef Py_GIL_DISABLED
14311
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14312
    // Lock-free fast path: check if there's already an interned copy that
14313
    // is in its final immortal state.
14314
    PyObject *r;
14315
    int res = PyDict_GetItemRef(interned, s, &r);
14316
    if (res < 0) {
14317
        PyErr_Clear();
14318
        return s;
14319
    }
14320
    if (res > 0) {
14321
        unsigned int state = _Py_atomic_load_uint8(&_PyUnicode_STATE(r).interned);
14322
        if (state == SSTATE_INTERNED_IMMORTAL) {
14323
            Py_DECREF(s);
14324
            return r;
14325
        }
14326
        // Not yet fully interned; fall through to the locking path.
14327
        Py_DECREF(r);
14328
    }
14329
#endif
14330
14331
#ifdef Py_GIL_DISABLED
14332
    // Immortalization writes to the refcount fields non-atomically. That
14333
    // races with Py_INCREF / Py_DECREF on the thread that owns `s`. If we
14334
    // don't own it (and its refcount hasn't been merged), intern a copy
14335
    // we own instead.
14336
    if (!can_immortalize_safely(s)) {
14337
        PyObject *copy = _PyUnicode_Copy(s);
14338
        if (copy == NULL) {
14339
            PyErr_Clear();
14340
            return s;
14341
        }
14342
        Py_DECREF(s);
14343
        s = copy;
14344
    }
14345
#endif
14346
14347
5.41M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14348
5.41M
    PyObject *t;
14349
5.41M
    {
14350
5.41M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14351
5.41M
        if (res < 0) {
14352
0
            PyErr_Clear();
14353
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14354
0
            return s;
14355
0
        }
14356
5.41M
        else if (res == 1) {
14357
            // value was already present (not inserted)
14358
4.90M
            Py_DECREF(s);
14359
4.90M
            if (immortalize &&
14360
1.13M
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14361
9.74k
                immortalize_interned(t);
14362
9.74k
            }
14363
4.90M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14364
4.90M
            return t;
14365
4.90M
        }
14366
511k
        else {
14367
            // value was newly inserted
14368
511k
            assert (s == t);
14369
511k
            Py_DECREF(t);
14370
511k
        }
14371
5.41M
    }
14372
14373
    /* NOT_INTERNED -> INTERNED_MORTAL */
14374
14375
5.41M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14376
14377
511k
    if (!_Py_IsImmortal(s)) {
14378
        /* The two references in interned dict (key and value) are not counted.
14379
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14380
511k
        Py_DECREF(s);
14381
511k
        Py_DECREF(s);
14382
511k
    }
14383
511k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14384
14385
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14386
14387
#ifdef Py_DEBUG
14388
    if (_Py_IsImmortal(s)) {
14389
        assert(immortalize);
14390
    }
14391
#endif
14392
511k
    if (immortalize) {
14393
270k
        immortalize_interned(s);
14394
270k
    }
14395
14396
511k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14397
511k
    return s;
14398
5.41M
}
14399
14400
void
14401
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14402
16.1M
{
14403
16.1M
    *p = intern_common(interp, *p, 1);
14404
16.1M
    assert(*p);
14405
16.1M
}
14406
14407
void
14408
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14409
50.4M
{
14410
50.4M
    *p = intern_common(interp, *p, 0);
14411
50.4M
    assert(*p);
14412
50.4M
}
14413
14414
14415
void
14416
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14417
0
{
14418
0
    _PyUnicode_InternImmortal(interp, p);
14419
0
    return;
14420
0
}
14421
14422
void
14423
PyUnicode_InternInPlace(PyObject **p)
14424
0
{
14425
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14426
0
    _PyUnicode_InternMortal(interp, p);
14427
0
}
14428
14429
// Public-looking name kept for the stable ABI; user should not call this:
14430
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14431
void
14432
PyUnicode_InternImmortal(PyObject **p)
14433
0
{
14434
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14435
0
    _PyUnicode_InternImmortal(interp, p);
14436
0
}
14437
14438
PyObject *
14439
PyUnicode_InternFromString(const char *cp)
14440
1.33M
{
14441
1.33M
    PyObject *s = PyUnicode_FromString(cp);
14442
1.33M
    if (s == NULL) {
14443
0
        return NULL;
14444
0
    }
14445
1.33M
    PyInterpreterState *interp = _PyInterpreterState_GET();
14446
1.33M
    _PyUnicode_InternMortal(interp, &s);
14447
1.33M
    return s;
14448
1.33M
}
14449
14450
14451
void
14452
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14453
0
{
14454
0
    PyObject *interned = get_interned_dict(interp);
14455
0
    if (interned == NULL) {
14456
0
        return;
14457
0
    }
14458
0
    assert(PyDict_CheckExact(interned));
14459
14460
0
    if (has_shared_intern_dict(interp)) {
14461
        // the dict doesn't belong to this interpreter, skip the debug
14462
        // checks on it and just clear the pointer to it
14463
0
        clear_interned_dict(interp);
14464
0
        return;
14465
0
    }
14466
14467
#ifdef INTERNED_STATS
14468
    fprintf(stderr, "releasing %zd interned strings\n",
14469
            PyDict_GET_SIZE(interned));
14470
14471
    Py_ssize_t total_length = 0;
14472
#endif
14473
0
    Py_ssize_t pos = 0;
14474
0
    PyObject *s, *ignored_value;
14475
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14476
0
        int shared = 0;
14477
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
14478
0
        case SSTATE_INTERNED_IMMORTAL:
14479
            /* Make immortal interned strings mortal again. */
14480
            // Skip the Immortal Instance check and restore
14481
            // the two references (key and value) ignored
14482
            // by PyUnicode_InternInPlace().
14483
0
            _Py_SetMortal(s, 2);
14484
#ifdef Py_REF_DEBUG
14485
            /* let's be pedantic with the ref total */
14486
            _Py_IncRefTotal(_PyThreadState_GET());
14487
            _Py_IncRefTotal(_PyThreadState_GET());
14488
#endif
14489
#ifdef INTERNED_STATS
14490
            total_length += PyUnicode_GET_LENGTH(s);
14491
#endif
14492
0
            break;
14493
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
14494
            /* It is shared between interpreters, so we should unmark it
14495
               only when this is the last interpreter in which it's
14496
               interned.  We immortalize all the statically initialized
14497
               strings during startup, so we can rely on the
14498
               main interpreter to be the last one. */
14499
0
            if (!_Py_IsMainInterpreter(interp)) {
14500
0
                shared = 1;
14501
0
            }
14502
0
            break;
14503
0
        case SSTATE_INTERNED_MORTAL:
14504
            // Restore 2 references held by the interned dict; these will
14505
            // be decref'd by clear_interned_dict's PyDict_Clear.
14506
0
            _Py_RefcntAdd(s, 2);
14507
#ifdef Py_REF_DEBUG
14508
            /* let's be pedantic with the ref total */
14509
            _Py_IncRefTotal(_PyThreadState_GET());
14510
            _Py_IncRefTotal(_PyThreadState_GET());
14511
#endif
14512
0
            break;
14513
0
        case SSTATE_NOT_INTERNED:
14514
0
            _Py_FALLTHROUGH;
14515
0
        default:
14516
0
            Py_UNREACHABLE();
14517
0
        }
14518
0
        if (!shared) {
14519
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
14520
0
        }
14521
0
    }
14522
#ifdef INTERNED_STATS
14523
    fprintf(stderr,
14524
            "total length of all interned strings: %zd characters\n",
14525
            total_length);
14526
#endif
14527
14528
0
    struct _Py_unicode_state *state = &interp->unicode;
14529
0
    struct _Py_unicode_ids *ids = &state->ids;
14530
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
14531
0
        Py_XINCREF(ids->array[i]);
14532
0
    }
14533
0
    clear_interned_dict(interp);
14534
0
    if (_Py_IsMainInterpreter(interp)) {
14535
0
        clear_global_interned_strings();
14536
0
    }
14537
0
}
14538
14539
14540
/********************* Unicode Iterator **************************/
14541
14542
typedef struct {
14543
    PyObject_HEAD
14544
    Py_ssize_t it_index;
14545
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14546
} unicodeiterobject;
14547
14548
static void
14549
unicodeiter_dealloc(PyObject *op)
14550
2.46M
{
14551
2.46M
    unicodeiterobject *it = (unicodeiterobject *)op;
14552
2.46M
    _PyObject_GC_UNTRACK(it);
14553
2.46M
    Py_XDECREF(it->it_seq);
14554
2.46M
    PyObject_GC_Del(it);
14555
2.46M
}
14556
14557
static int
14558
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
14559
14
{
14560
14
    unicodeiterobject *it = (unicodeiterobject *)op;
14561
14
    Py_VISIT(it->it_seq);
14562
14
    return 0;
14563
14
}
14564
14565
/* tp_iternext for str_iterator.
 *
 * Returns the next character as a string built by unicode_char().  At the end
 * of the string the iterator drops its reference to the string and returns
 * NULL; an already exhausted iterator keeps returning NULL.
 */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: release the string. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    int kind = PyUnicode_KIND(seq);
    const void *data = PyUnicode_DATA(seq);
    Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(chr);
}
14589
14590
/* Iteration step for strings asserted to be compact ASCII.
 *
 * Reads the next byte from the character data located right after the
 * PyASCIIObject header and returns the matching entry of the ASCII singleton
 * table.  At the end of the string the iterator drops its reference to the
 * string and returns NULL.
 */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: release the string. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                          data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
}
14612
14613
/* __length_hint__: number of characters left to iterate, or 0 once the
   iterator no longer holds a string. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    Py_ssize_t remaining = (it->it_seq != NULL)
        ? PyUnicode_GET_LENGTH(it->it_seq) - it->it_index
        : 0;
    return PyLong_FromSsize_t(remaining);
}
14622
14623
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14624
14625
/* __reduce__: returns (iter, (string,), index) while a string is held, and
   (iter, (empty_string,)) once the iterator is exhausted. */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    /* _PyEval_GetBuiltin can invoke arbitrary code,
     * call must be before access of iterator pointers.
     * see issue #101765 */
    PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (it->it_seq != NULL) {
        return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
    }

    PyObject *empty = _PyUnicode_GetEmpty();
    if (empty == NULL) {
        Py_XDECREF(iter);
        return NULL;
    }
    return Py_BuildValue("N(N)", iter, empty);
}
14646
14647
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14648
14649
/* __setstate__: set the iteration index from an integer `state`, clamped to
   [0, len(string)].  Has no effect on an exhausted iterator.  Returns None,
   or NULL if `state` cannot be converted to Py_ssize_t. */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (it->it_seq != NULL) {
        if (index < 0) {
            index = 0;
        }
        else {
            Py_ssize_t length = PyUnicode_GET_LENGTH(it->it_seq);
            if (index > length) {
                index = length; /* iterator truncated */
            }
        }
        it->it_index = index;
    }
    Py_RETURN_NONE;
}
14665
14666
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14667
14668
static PyMethodDef unicodeiter_methods[] = {
14669
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
14670
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
14671
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
14672
    {NULL,      NULL}       /* sentinel */
14673
};
14674
14675
PyTypeObject PyUnicodeIter_Type = {
14676
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14677
    "str_iterator",         /* tp_name */
14678
    sizeof(unicodeiterobject),      /* tp_basicsize */
14679
    0,                  /* tp_itemsize */
14680
    /* methods */
14681
    unicodeiter_dealloc,/* tp_dealloc */
14682
    0,                  /* tp_vectorcall_offset */
14683
    0,                  /* tp_getattr */
14684
    0,                  /* tp_setattr */
14685
    0,                  /* tp_as_async */
14686
    0,                  /* tp_repr */
14687
    0,                  /* tp_as_number */
14688
    0,                  /* tp_as_sequence */
14689
    0,                  /* tp_as_mapping */
14690
    0,                  /* tp_hash */
14691
    0,                  /* tp_call */
14692
    0,                  /* tp_str */
14693
    PyObject_GenericGetAttr,        /* tp_getattro */
14694
    0,                  /* tp_setattro */
14695
    0,                  /* tp_as_buffer */
14696
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14697
    0,                  /* tp_doc */
14698
    unicodeiter_traverse, /* tp_traverse */
14699
    0,                  /* tp_clear */
14700
    0,                  /* tp_richcompare */
14701
    0,                  /* tp_weaklistoffset */
14702
    PyObject_SelfIter,          /* tp_iter */
14703
    unicodeiter_next,   /* tp_iternext */
14704
    unicodeiter_methods,            /* tp_methods */
14705
    0,
14706
};
14707
14708
PyTypeObject _PyUnicodeASCIIIter_Type = {
14709
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14710
    .tp_name = "str_ascii_iterator",
14711
    .tp_basicsize = sizeof(unicodeiterobject),
14712
    .tp_dealloc = unicodeiter_dealloc,
14713
    .tp_getattro = PyObject_GenericGetAttr,
14714
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14715
    .tp_traverse = unicodeiter_traverse,
14716
    .tp_iter = PyObject_SelfIter,
14717
    .tp_iternext = unicode_ascii_iter_next,
14718
    .tp_methods = unicodeiter_methods,
14719
};
14720
14721
/* Create an iterator over the str object `seq`.
 *
 * Compact ASCII strings get a _PyUnicodeASCIIIter_Type iterator; every other
 * string gets a PyUnicodeIter_Type iterator.  The iterator holds a new
 * reference to `seq` and starts at index 0.
 *
 * Returns NULL with an exception set if `seq` is not a str
 * (PyErr_BadInternalCall), or NULL if the allocation fails. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
        ? &_PyUnicodeASCIIIter_Type
        : &PyUnicodeIter_Type;

    unicodeiterobject *iterator = PyObject_GC_New(unicodeiterobject, iter_type);
    if (iterator == NULL) {
        return NULL;
    }

    iterator->it_index = 0;
    iterator->it_seq = Py_NewRef(seq);
    /* Start GC tracking only after every field is initialized. */
    _PyObject_GC_TRACK(iterator);
    return (PyObject *)iterator;
}
14743
14744
static int
14745
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14746
144
{
14747
144
    int res;
14748
144
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14749
144
    if (res == -2) {
14750
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
14751
0
        return -1;
14752
0
    }
14753
144
    if (res < 0) {
14754
0
        PyErr_NoMemory();
14755
0
        return -1;
14756
0
    }
14757
144
    return 0;
14758
144
}
14759
14760
14761
static int
14762
config_get_codec_name(wchar_t **config_encoding)
14763
72
{
14764
72
    char *encoding;
14765
72
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14766
0
        return -1;
14767
0
    }
14768
14769
72
    PyObject *name_obj = NULL;
14770
72
    PyObject *codec = _PyCodec_Lookup(encoding);
14771
72
    PyMem_RawFree(encoding);
14772
14773
72
    if (!codec)
14774
0
        goto error;
14775
14776
72
    name_obj = PyObject_GetAttrString(codec, "name");
14777
72
    Py_CLEAR(codec);
14778
72
    if (!name_obj) {
14779
0
        goto error;
14780
0
    }
14781
14782
72
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14783
72
    Py_DECREF(name_obj);
14784
72
    if (wname == NULL) {
14785
0
        goto error;
14786
0
    }
14787
14788
72
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14789
72
    if (raw_wname == NULL) {
14790
0
        PyMem_Free(wname);
14791
0
        PyErr_NoMemory();
14792
0
        goto error;
14793
0
    }
14794
14795
72
    PyMem_RawFree(*config_encoding);
14796
72
    *config_encoding = raw_wname;
14797
14798
72
    PyMem_Free(wname);
14799
72
    return 0;
14800
14801
0
error:
14802
0
    Py_XDECREF(codec);
14803
0
    Py_XDECREF(name_obj);
14804
0
    return -1;
14805
72
}
14806
14807
14808
static PyStatus
14809
init_stdio_encoding(PyInterpreterState *interp)
14810
36
{
14811
    /* Update the stdio encoding to the normalized Python codec name. */
14812
36
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14813
36
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
14814
0
        return _PyStatus_ERR("failed to get the Python codec name "
14815
0
                             "of the stdio encoding");
14816
0
    }
14817
36
    return _PyStatus_OK();
14818
36
}
14819
14820
14821
static int
14822
init_fs_codec(PyInterpreterState *interp)
14823
36
{
14824
36
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
14825
14826
36
    _Py_error_handler error_handler;
14827
36
    error_handler = get_error_handler_wide(config->filesystem_errors);
14828
36
    if (error_handler == _Py_ERROR_UNKNOWN) {
14829
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
14830
0
        return -1;
14831
0
    }
14832
14833
36
    char *encoding, *errors;
14834
36
    if (encode_wstr_utf8(config->filesystem_encoding,
14835
36
                         &encoding,
14836
36
                         "filesystem_encoding") < 0) {
14837
0
        return -1;
14838
0
    }
14839
14840
36
    if (encode_wstr_utf8(config->filesystem_errors,
14841
36
                         &errors,
14842
36
                         "filesystem_errors") < 0) {
14843
0
        PyMem_RawFree(encoding);
14844
0
        return -1;
14845
0
    }
14846
14847
36
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
14848
36
    PyMem_RawFree(fs_codec->encoding);
14849
36
    fs_codec->encoding = encoding;
14850
    /* encoding has been normalized by init_fs_encoding() */
14851
36
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
14852
36
    PyMem_RawFree(fs_codec->errors);
14853
36
    fs_codec->errors = errors;
14854
36
    fs_codec->error_handler = error_handler;
14855
14856
#ifdef _Py_FORCE_UTF8_FS_ENCODING
14857
    assert(fs_codec->utf8 == 1);
14858
#endif
14859
14860
    /* At this point, PyUnicode_EncodeFSDefault() and
14861
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
14862
       the C implementation of the filesystem encoding. */
14863
14864
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
14865
       global configuration variables. */
14866
36
    if (_Py_IsMainInterpreter(interp)) {
14867
14868
36
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
14869
36
                                      fs_codec->errors) < 0) {
14870
0
            PyErr_NoMemory();
14871
0
            return -1;
14872
0
        }
14873
36
    }
14874
36
    return 0;
14875
36
}
14876
14877
14878
static PyStatus
14879
init_fs_encoding(PyThreadState *tstate)
14880
36
{
14881
36
    PyInterpreterState *interp = tstate->interp;
14882
14883
    /* Update the filesystem encoding to the normalized Python codec name.
14884
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
14885
       (Python codec name). */
14886
36
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14887
36
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
14888
0
        _Py_DumpPathConfig(tstate);
14889
0
        return _PyStatus_ERR("failed to get the Python codec "
14890
0
                             "of the filesystem encoding");
14891
0
    }
14892
14893
36
    if (init_fs_codec(interp) < 0) {
14894
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
14895
0
    }
14896
36
    return _PyStatus_OK();
14897
36
}
14898
14899
14900
PyStatus
14901
_PyUnicode_InitEncodings(PyThreadState *tstate)
14902
36
{
14903
36
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
14904
36
    if (_PyStatus_EXCEPTION(status)) {
14905
0
        return status;
14906
0
    }
14907
36
    status = init_fs_encoding(tstate);
14908
36
    if (_PyStatus_EXCEPTION(status)) {
14909
0
        return status;
14910
0
    }
14911
14912
36
    return init_stdio_encoding(tstate->interp);
14913
36
}
14914
14915
14916
static void
14917
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
14918
0
{
14919
0
    PyMem_RawFree(fs_codec->encoding);
14920
0
    fs_codec->encoding = NULL;
14921
0
    fs_codec->utf8 = 0;
14922
0
    PyMem_RawFree(fs_codec->errors);
14923
0
    fs_codec->errors = NULL;
14924
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
14925
0
}
14926
14927
14928
#ifdef Py_DEBUG
/* Debug-only helper: returns non-zero when the main interpreter's interned
 * dict is NULL. */
static inline int
unicode_is_finalizing(void)
{
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
    return get_interned_dict(main_interp) == NULL;
}
#endif
14935
14936
14937
void
14938
_PyUnicode_FiniTypes(PyInterpreterState *interp)
14939
0
{
14940
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
14941
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
14942
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
14943
0
}
14944
14945
14946
void
14947
_PyUnicode_Fini(PyInterpreterState *interp)
14948
0
{
14949
0
    struct _Py_unicode_state *state = &interp->unicode;
14950
14951
0
    if (!has_shared_intern_dict(interp)) {
14952
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
14953
0
        assert(get_interned_dict(interp) == NULL);
14954
0
    }
14955
14956
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
14957
14958
    // bpo-47182: force a unicodedata CAPI capsule re-import on
14959
    // subsequent initialization of interpreter.
14960
0
    interp->unicode.ucnhash_capi = NULL;
14961
14962
0
    unicode_clear_identifiers(state);
14963
0
}
14964
14965
/* A _string module, to export formatter_parser and formatter_field_name_split
14966
   to the string.Formatter class implemented in Python. */
14967
14968
static PyMethodDef _string_methods[] = {
14969
    {"formatter_field_name_split", formatter_field_name_split,
14970
     METH_O, PyDoc_STR("split the argument as a field name")},
14971
    {"formatter_parser", formatter_parser,
14972
     METH_O, PyDoc_STR("parse the argument as a format string")},
14973
    {NULL, NULL}
14974
};
14975
14976
static PyModuleDef_Slot module_slots[] = {
14977
    _Py_ABI_SLOT,
14978
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
14979
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
14980
    {0, NULL}
14981
};
14982
14983
static struct PyModuleDef _string_module = {
14984
    PyModuleDef_HEAD_INIT,
14985
    .m_name = "_string",
14986
    .m_doc = PyDoc_STR("string helper module"),
14987
    .m_size = 0,
14988
    .m_methods = _string_methods,
14989
    .m_slots = module_slots,
14990
};
14991
14992
PyMODINIT_FUNC
14993
PyInit__string(void)
14994
8
{
14995
8
    return PyModuleDef_Init(&_string_module);
14996
8
}
14997
14998
14999
#undef PyUnicode_KIND
15000
int PyUnicode_KIND(PyObject *op)
15001
0
{
15002
0
    if (!PyUnicode_Check(op)) {
15003
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
15004
0
        return -1;
15005
0
    }
15006
0
    return _PyASCIIObject_CAST(op)->state.kind;
15007
0
}
15008
15009
#undef PyUnicode_DATA
15010
void* PyUnicode_DATA(PyObject *op)
15011
0
{
15012
0
    if (!PyUnicode_Check(op)) {
15013
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
15014
0
        return NULL;
15015
0
    }
15016
0
    return _PyUnicode_DATA(op);
15017
0
}