Coverage Report

Created: 2026-05-16 06:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_RepeatBuffer()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_initconfig.h"    // _PyStatus_OK()
50
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
51
#include "pycore_long.h"          // _PyLong_FormatWriter()
52
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55
#include "pycore_pyhash.h"        // _Py_HashSecret_t
56
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
57
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
58
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
59
#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def c_default_init(self):
90
        import libclinic
91
        self.c_default = libclinic.c_unichar_repr(self.default)
92
93
[python start generated code]*/
94
/*[python end generated code: output=da39a3ee5e6b4b0d input=22f057b68fd9a65a]*/
95
96
/* --- Globals ------------------------------------------------------------
97
98
NOTE: In the interpreter's initialization phase, some globals are currently
99
      initialized dynamically as needed. In the process Unicode objects may
100
      be created before the Unicode type is ready.
101
102
*/
103
104
24.4M
#define MAX_UNICODE _Py_MAX_UNICODE
105
258M
#define ensure_unicode _PyUnicode_EnsureUnicode
106
107
#ifdef Py_DEBUG
108
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
109
#else
110
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
111
#endif
112
113
/* Read the `utf8` pointer stored in the compact-unicode layout of `op`,
   going through the FT_ATOMIC_LOAD_PTR_ACQUIRE wrapper. */
static inline char *
_PyUnicode_UTF8(PyObject *op)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(compact->utf8);
}
117
118
/* Return the UTF-8 buffer of `op`.  For a compact ASCII string this is the
   memory located directly after the PyASCIIObject header; for every other
   string it is whatever _PyUnicode_UTF8() reports. */
static inline char *
PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyUnicode_UTF8(op);
    }
    return (char *)(_PyASCIIObject_CAST(op) + 1);
}
128
129
/* Store `utf8` into the `utf8` field of the compact-unicode layout of `op`,
   going through the FT_ATOMIC_STORE_PTR_RELEASE wrapper. */
static inline void
PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    FT_ATOMIC_STORE_PTR_RELEASE(compact->utf8, utf8);
}
133
134
/* Return the UTF-8 length recorded for `op`: the `length` field for a
   compact ASCII string, the `utf8_length` field otherwise. */
static inline Py_ssize_t
PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    return PyUnicode_IS_COMPACT_ASCII(op)
        ? _PyASCIIObject_CAST(op)->length
        : _PyCompactUnicodeObject_CAST(op)->utf8_length;
}
144
145
/* Record `length` in the `utf8_length` field of the compact-unicode layout
   of `op`. */
static inline void
PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    compact->utf8_length = length;
}
149
150
/* Lvalue accessors for the `length`, `state` and `hash` fields of the
   PyASCIIObject header of a str object. */
#define _PyUnicode_LENGTH(op) (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op)  (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op)   (_PyASCIIObject_CAST(op)->hash)
156
157
180M
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
158
159
/* Store `hash` into the `hash` field of the PyASCIIObject header of `op`,
   going through the FT_ATOMIC_STORE_SSIZE_RELAXED wrapper. */
static inline void
PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    FT_ATOMIC_STORE_SSIZE_RELAXED(ascii->hash, hash);
}
163
164
#define _PyUnicode_DATA_ANY(op)                         \
165
62.3M
    (_PyUnicodeObject_CAST(op)->data.any)
166
167
/* Return non-zero when the UTF-8 pointer of `op` is the very same address
   as its character data.  Must not be called on a compact ASCII string. */
static inline int
_PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));

    const void *utf8 = _PyUnicode_UTF8(op);
    const void *data = PyUnicode_DATA(op);
    return utf8 == data;
}
173
174
/* true if the Unicode object has an allocated UTF-8 memory block
175
   (not shared with other data) */
176
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
177
608M
{
178
608M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
179
231M
            && _PyUnicode_UTF8(op) != NULL
180
13.2M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
181
608M
}
182
183
184
224M
#define LATIN1 _Py_LATIN1_CHR
185
186
/* Forward declaration */
187
static PyObject *
188
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
189
                    const char *errors);
190
static PyObject *
191
unicode_decode_utf8(const char *s, Py_ssize_t size,
192
                    _Py_error_handler error_handler, const char *errors,
193
                    Py_ssize_t *consumed);
194
#ifdef Py_DEBUG
195
static inline int unicode_is_finalizing(void);
196
static int unicode_is_singleton(PyObject *unicode);
197
#endif
198
199
200
// Return a reference to the immortal empty string singleton.
201
PyObject*
202
_PyUnicode_GetEmpty(void)
203
109M
{
204
109M
    _Py_DECLARE_STR(empty, "");
205
109M
    return &_Py_STR(empty);
206
109M
}
207
208
/* This dictionary holds per-interpreter interned strings.
209
 * See InternalDocs/string_interning.md for details.
210
 */
211
static inline PyObject *
get_interned_dict(PyInterpreterState *interp)
{
    /* NULL until init_interned_dict() has run, and again after
       clear_interned_dict(). */
    PyObject *interned = _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
    return interned;
}
215
216
/* This hashtable holds statically allocated interned strings.
217
 * See InternalDocs/string_interning.md for details.
218
 */
219
6.12M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
220
221
/* Get number of all interned strings for the current interpreter. */
222
Py_ssize_t
223
_PyUnicode_InternedSize(void)
224
0
{
225
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
226
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
227
0
}
228
229
/* Get number of immortal interned strings for the current interpreter. */
230
Py_ssize_t
231
_PyUnicode_InternedSize_Immortal(void)
232
0
{
233
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
234
0
    PyObject *key, *value;
235
0
    Py_ssize_t pos = 0;
236
0
    Py_ssize_t count = 0;
237
238
    // It's tempting to keep a count and avoid a loop here. But, this function
239
    // is intended for refleak tests. It spends extra work to report the true
240
    // value, to help detect bugs in optimizations.
241
242
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
243
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
244
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
245
0
           count++;
246
0
       }
247
0
    }
248
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
249
0
}
250
251
static Py_hash_t unicode_hash(PyObject *);
252
253
static Py_uhash_t
254
hashtable_unicode_hash(const void *key)
255
6.12M
{
256
6.12M
    return unicode_hash((PyObject *)key);
257
6.12M
}
258
259
static int
260
hashtable_unicode_compare(const void *key1, const void *key2)
261
531k
{
262
531k
    PyObject *obj1 = (PyObject *)key1;
263
531k
    PyObject *obj2 = (PyObject *)key2;
264
531k
    if (obj1 != NULL && obj2 != NULL) {
265
531k
        return unicode_eq(obj1, obj2);
266
531k
    }
267
0
    else {
268
0
        return obj1 == obj2;
269
0
    }
270
531k
}
271
272
/* Return true if this interpreter should share the main interpreter's
273
   intern_dict.  That's important for interpreters which load basic
274
   single-phase init extension modules (m_size == -1).  There could be interned
275
   immortal strings that are shared between interpreters, due to the
276
   PyDict_Update(mdict, m_copy) call in import_find_extension().
277
278
   It's not safe to deallocate those strings until all interpreters that
279
   potentially use them are freed.  By storing them in the main interpreter, we
280
   ensure they get freed after all other interpreters are freed.
281
*/
282
static bool
283
has_shared_intern_dict(PyInterpreterState *interp)
284
37
{
285
37
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
286
37
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
287
37
}
288
289
static int
290
init_interned_dict(PyInterpreterState *interp)
291
37
{
292
37
    assert(get_interned_dict(interp) == NULL);
293
37
    PyObject *interned;
294
37
    if (has_shared_intern_dict(interp)) {
295
0
        interned = get_interned_dict(_PyInterpreterState_Main());
296
0
        Py_INCREF(interned);
297
0
    }
298
37
    else {
299
37
        interned = PyDict_New();
300
37
        if (interned == NULL) {
301
0
            return -1;
302
0
        }
303
37
    }
304
37
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
305
37
    return 0;
306
37
}
307
308
static void
309
clear_interned_dict(PyInterpreterState *interp)
310
0
{
311
0
    PyObject *interned = get_interned_dict(interp);
312
0
    if (interned != NULL) {
313
0
        if (!has_shared_intern_dict(interp)) {
314
            // only clear if the dict belongs to this interpreter
315
0
            PyDict_Clear(interned);
316
0
        }
317
0
        Py_DECREF(interned);
318
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
319
0
    }
320
0
}
321
322
static PyStatus
323
init_global_interned_strings(PyInterpreterState *interp)
324
37
{
325
37
    assert(INTERNED_STRINGS == NULL);
326
37
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
327
328
37
    INTERNED_STRINGS = _Py_hashtable_new_full(
329
37
        hashtable_unicode_hash,
330
37
        hashtable_unicode_compare,
331
        // Objects stored here are immortal and statically allocated,
332
        // so we don't need key_destroy_func & value_destroy_func:
333
37
        NULL,
334
37
        NULL,
335
37
        &hashtable_alloc
336
37
    );
337
37
    if (INTERNED_STRINGS == NULL) {
338
0
        PyErr_Clear();
339
0
        return _PyStatus_ERR("failed to create global interned dict");
340
0
    }
341
342
    /* Intern statically allocated string identifiers, deepfreeze strings,
343
        * and one-byte latin-1 strings.
344
        * This must be done before any module initialization so that statically
345
        * allocated string identifiers are used instead of heap allocated strings.
346
        * Deepfreeze uses the interned identifiers if present to save space
347
        * else generates them and they are interned to speed up dict lookups.
348
    */
349
37
    _PyUnicode_InitStaticStrings(interp);
350
351
9.50k
    for (int i = 0; i < 256; i++) {
352
9.47k
        PyObject *s = LATIN1(i);
353
9.47k
        _PyUnicode_InternStatic(interp, &s);
354
9.47k
        assert(s == LATIN1(i));
355
9.47k
    }
356
#ifdef Py_DEBUG
357
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
358
359
    for (int i = 0; i < 256; i++) {
360
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
361
    }
362
#endif
363
37
    return _PyStatus_OK();
364
37
}
365
366
static void clear_global_interned_strings(void)
367
0
{
368
0
    if (INTERNED_STRINGS != NULL) {
369
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
370
0
        INTERNED_STRINGS = NULL;
371
0
    }
372
0
}
373
374
/* Statement macro: return the empty-string singleton from the enclosing
   function. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { return _PyUnicode_GetEmpty(); } while (0)
378
379
380
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code point, 1 for whitespace and 0 (the implicit
   default of every entry not listed) for everything else. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
410
411
/* forward */
412
static PyObject* get_latin1_char(unsigned char ch);
413
414
415
static PyObject *
416
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
417
static PyObject *
418
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
419
static PyObject *
420
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
421
422
static PyObject *
423
unicode_encode_call_errorhandler(const char *errors,
424
       PyObject **errorHandler,const char *encoding, const char *reason,
425
       PyObject *unicode, PyObject **exceptionObject,
426
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
427
428
static void
429
raise_encode_exception(PyObject **exceptionObject,
430
                       const char *encoding,
431
                       PyObject *unicode,
432
                       Py_ssize_t startpos, Py_ssize_t endpos,
433
                       const char *reason);
434
435
/* Fast detection of ASCII line breaks: a 128-entry table indexed by ASCII
   code point, 1 for a line break and 0 (the implicit default of every entry
   not listed) for everything else. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
462
463
static int convert_uc(PyObject *obj, void *addr);
464
465
struct encoding_map;
466
#include "clinic/unicodeobject.c.h"
467
468
_Py_error_handler
469
_Py_GetErrorHandler(const char *errors)
470
3.63M
{
471
3.63M
    if (errors == NULL || strcmp(errors, "strict") == 0) {
472
2.90M
        return _Py_ERROR_STRICT;
473
2.90M
    }
474
730k
    if (strcmp(errors, "surrogateescape") == 0) {
475
519k
        return _Py_ERROR_SURROGATEESCAPE;
476
519k
    }
477
211k
    if (strcmp(errors, "replace") == 0) {
478
211k
        return _Py_ERROR_REPLACE;
479
211k
    }
480
4
    if (strcmp(errors, "ignore") == 0) {
481
0
        return _Py_ERROR_IGNORE;
482
0
    }
483
4
    if (strcmp(errors, "backslashreplace") == 0) {
484
0
        return _Py_ERROR_BACKSLASHREPLACE;
485
0
    }
486
4
    if (strcmp(errors, "surrogatepass") == 0) {
487
4
        return _Py_ERROR_SURROGATEPASS;
488
4
    }
489
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
490
0
        return _Py_ERROR_XMLCHARREFREPLACE;
491
0
    }
492
0
    return _Py_ERROR_OTHER;
493
0
}
494
495
496
static _Py_error_handler
497
get_error_handler_wide(const wchar_t *errors)
498
74
{
499
74
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
500
0
        return _Py_ERROR_STRICT;
501
0
    }
502
74
    if (wcscmp(errors, L"surrogateescape") == 0) {
503
74
        return _Py_ERROR_SURROGATEESCAPE;
504
74
    }
505
0
    if (wcscmp(errors, L"replace") == 0) {
506
0
        return _Py_ERROR_REPLACE;
507
0
    }
508
0
    if (wcscmp(errors, L"ignore") == 0) {
509
0
        return _Py_ERROR_IGNORE;
510
0
    }
511
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
512
0
        return _Py_ERROR_BACKSLASHREPLACE;
513
0
    }
514
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
515
0
        return _Py_ERROR_SURROGATEPASS;
516
0
    }
517
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
518
0
        return _Py_ERROR_XMLCHARREFREPLACE;
519
0
    }
520
0
    return _Py_ERROR_OTHER;
521
0
}
522
523
524
static inline int
525
unicode_check_encoding_errors(const char *encoding, const char *errors)
526
40.0M
{
527
40.0M
    if (encoding == NULL && errors == NULL) {
528
11.9M
        return 0;
529
11.9M
    }
530
531
28.1M
    PyInterpreterState *interp = _PyInterpreterState_GET();
532
28.1M
#ifndef Py_DEBUG
533
    /* In release mode, only check in development mode (-X dev) */
534
28.1M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
535
28.1M
        return 0;
536
28.1M
    }
537
#else
538
    /* Always check in debug mode */
539
#endif
540
541
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
542
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
543
0
    if (!interp->unicode.fs_codec.encoding) {
544
0
        return 0;
545
0
    }
546
547
    /* Disable checks during Python finalization. For example, it allows to
548
     * call PyObject_Dump() during finalization for debugging purpose.
549
     */
550
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
551
0
        return 0;
552
0
    }
553
554
0
    if (encoding != NULL
555
        // Fast path for the most common built-in encodings. Even if the codec
556
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
557
        // create a temporary Unicode string (the key in the cache).
558
0
        && strcmp(encoding, "utf-8") != 0
559
0
        && strcmp(encoding, "utf8") != 0
560
0
        && strcmp(encoding, "ascii") != 0)
561
0
    {
562
0
        PyObject *handler = _PyCodec_Lookup(encoding);
563
0
        if (handler == NULL) {
564
0
            return -1;
565
0
        }
566
0
        Py_DECREF(handler);
567
0
    }
568
569
0
    if (errors != NULL
570
        // Fast path for the most common built-in error handlers.
571
0
        && strcmp(errors, "strict") != 0
572
0
        && strcmp(errors, "ignore") != 0
573
0
        && strcmp(errors, "replace") != 0
574
0
        && strcmp(errors, "surrogateescape") != 0
575
0
        && strcmp(errors, "surrogatepass") != 0)
576
0
    {
577
0
        PyObject *handler = PyCodec_LookupError(errors);
578
0
        if (handler == NULL) {
579
0
            return -1;
580
0
        }
581
0
        Py_DECREF(handler);
582
0
    }
583
0
    return 0;
584
0
}
585
586
587
int
588
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
589
0
{
590
0
#define CHECK(expr) \
591
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
592
#ifdef Py_GIL_DISABLED
593
# define CHECK_IF_GIL(expr) (void)(expr)
594
# define CHECK_IF_FT(expr) CHECK(expr)
595
#else
596
0
# define CHECK_IF_GIL(expr) CHECK(expr)
597
0
# define CHECK_IF_FT(expr) (void)(expr)
598
0
#endif
599
600
601
0
    assert(op != NULL);
602
0
    CHECK(PyUnicode_Check(op));
603
604
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
605
0
    int kind = ascii->state.kind;
606
607
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
608
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
609
0
    }
610
0
    else {
611
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
612
0
        void *data;
613
614
0
        if (ascii->state.compact == 1) {
615
0
            data = compact + 1;
616
0
            CHECK(kind == PyUnicode_1BYTE_KIND
617
0
                                 || kind == PyUnicode_2BYTE_KIND
618
0
                                 || kind == PyUnicode_4BYTE_KIND);
619
0
            CHECK(ascii->state.ascii == 0);
620
0
            CHECK(_PyUnicode_UTF8(op) != data);
621
0
        }
622
0
        else {
623
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
624
625
0
            data = unicode->data.any;
626
0
            CHECK(kind == PyUnicode_1BYTE_KIND
627
0
                     || kind == PyUnicode_2BYTE_KIND
628
0
                     || kind == PyUnicode_4BYTE_KIND);
629
0
            CHECK(ascii->state.compact == 0);
630
0
            CHECK(data != NULL);
631
0
            if (ascii->state.ascii) {
632
0
                CHECK(_PyUnicode_UTF8(op) == data);
633
0
                CHECK(compact->utf8_length == ascii->length);
634
0
            }
635
0
            else {
636
0
                CHECK(_PyUnicode_UTF8(op) != data);
637
0
            }
638
0
        }
639
0
#ifndef Py_GIL_DISABLED
640
0
        if (_PyUnicode_UTF8(op) == NULL)
641
0
            CHECK(compact->utf8_length == 0);
642
0
#endif
643
0
    }
644
645
    /* check that the best kind is used: O(n) operation */
646
0
    if (check_content) {
647
0
        Py_ssize_t i;
648
0
        Py_UCS4 maxchar = 0;
649
0
        const void *data;
650
0
        Py_UCS4 ch;
651
652
0
        data = PyUnicode_DATA(ascii);
653
0
        for (i=0; i < ascii->length; i++)
654
0
        {
655
0
            ch = PyUnicode_READ(kind, data, i);
656
0
            if (ch > maxchar)
657
0
                maxchar = ch;
658
0
        }
659
0
        if (kind == PyUnicode_1BYTE_KIND) {
660
0
            if (ascii->state.ascii == 0) {
661
0
                CHECK(maxchar >= 128);
662
0
                CHECK(maxchar <= 255);
663
0
            }
664
0
            else
665
0
                CHECK(maxchar < 128);
666
0
        }
667
0
        else if (kind == PyUnicode_2BYTE_KIND) {
668
0
            CHECK(maxchar >= 0x100);
669
0
            CHECK(maxchar <= 0xFFFF);
670
0
        }
671
0
        else {
672
0
            CHECK(maxchar >= 0x10000);
673
0
            CHECK(maxchar <= MAX_UNICODE);
674
0
        }
675
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
676
0
    }
677
678
    /* Check interning state */
679
#ifdef Py_DEBUG
680
    // Note that we do not check `_Py_IsImmortal(op)` in the GIL-enabled build
681
    // since stable ABI extensions can make immortal strings mortal (but with a
682
    // high enough refcount).
683
    switch (PyUnicode_CHECK_INTERNED(op)) {
684
        case SSTATE_NOT_INTERNED:
685
            if (ascii->state.statically_allocated) {
686
                // This state is for two exceptions:
687
                // - strings are currently checked before they're interned
688
                // - the 256 one-latin1-character strings
689
                //   are static but use SSTATE_NOT_INTERNED
690
            }
691
            else {
692
                CHECK_IF_GIL(!_Py_IsImmortal(op));
693
            }
694
            break;
695
        case SSTATE_INTERNED_MORTAL:
696
            CHECK(!ascii->state.statically_allocated);
697
            CHECK_IF_GIL(!_Py_IsImmortal(op));
698
            break;
699
        case SSTATE_INTERNED_IMMORTAL:
700
            CHECK(!ascii->state.statically_allocated);
701
            CHECK_IF_FT(_Py_IsImmortal(op));
702
            break;
703
        case SSTATE_INTERNED_IMMORTAL_STATIC:
704
            CHECK(ascii->state.statically_allocated);
705
            CHECK_IF_FT(_Py_IsImmortal(op));
706
            break;
707
        default:
708
            Py_UNREACHABLE();
709
    }
710
#endif
711
712
0
    return 1;
713
714
0
#undef CHECK
715
0
}
716
717
/* Normalize a freshly built string: an empty string is replaced by the
   empty singleton and a one-character 1-byte-kind string by the matching
   Latin-1 singleton.  When a singleton is substituted for a different
   object, the reference to `unicode` is released.  Any other string is
   returned unchanged. */
PyObject*
_PyUnicode_Result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    PyObject *singleton = NULL;
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        singleton = _PyUnicode_GetEmpty();
    }
    else if (length == 1 && PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
        Py_UCS1 ch = PyUnicode_1BYTE_DATA(unicode)[0];
        singleton = LATIN1(ch);
    }

    if (singleton != NULL) {
        if (unicode != singleton) {
            Py_DECREF(unicode);
        }
        return singleton;
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}
747
1.60M
#define unicode_result _PyUnicode_Result
748
749
/* Return a string equal to `unicode`: a new reference to the object itself
   when it is an exact str, otherwise a copy made by _PyUnicode_Copy(). */
static PyObject*
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        /* Subtype -- return genuine unicode string with the same value. */
        return _PyUnicode_Copy(unicode);
    }
    return Py_NewRef(unicode);
}
759
760
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
761
   ASCII, Latin1, UTF-8, etc. */
762
static char*
763
backslashreplace(PyBytesWriter *writer, char *str,
764
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
765
0
{
766
0
    Py_ssize_t size, i;
767
0
    Py_UCS4 ch;
768
0
    int kind;
769
0
    const void *data;
770
771
0
    kind = PyUnicode_KIND(unicode);
772
0
    data = PyUnicode_DATA(unicode);
773
774
0
    size = 0;
775
    /* determine replacement size */
776
0
    for (i = collstart; i < collend; ++i) {
777
0
        Py_ssize_t incr;
778
779
0
        ch = PyUnicode_READ(kind, data, i);
780
0
        if (ch < 0x100)
781
0
            incr = 2+2;
782
0
        else if (ch < 0x10000)
783
0
            incr = 2+4;
784
0
        else {
785
0
            assert(ch <= MAX_UNICODE);
786
0
            incr = 2+8;
787
0
        }
788
0
        if (size > PY_SSIZE_T_MAX - incr) {
789
0
            PyErr_SetString(PyExc_OverflowError,
790
0
                            "encoded result is too long for a Python string");
791
0
            return NULL;
792
0
        }
793
0
        size += incr;
794
0
    }
795
796
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
797
0
    if (str == NULL) {
798
0
        return NULL;
799
0
    }
800
801
    /* generate replacement */
802
0
    for (i = collstart; i < collend; ++i) {
803
0
        ch = PyUnicode_READ(kind, data, i);
804
0
        *str++ = '\\';
805
0
        if (ch >= 0x00010000) {
806
0
            *str++ = 'U';
807
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
808
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
809
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
810
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
811
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
812
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
813
0
        }
814
0
        else if (ch >= 0x100) {
815
0
            *str++ = 'u';
816
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
817
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
818
0
        }
819
0
        else
820
0
            *str++ = 'x';
821
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
822
0
        *str++ = Py_hexdigits[ch&0xf];
823
0
    }
824
0
    return str;
825
0
}
826
827
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
828
   ASCII, Latin1, UTF-8, etc. */
829
static char*
830
xmlcharrefreplace(PyBytesWriter *writer, char *str,
831
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
832
0
{
833
0
    Py_ssize_t size, i;
834
0
    Py_UCS4 ch;
835
0
    int kind;
836
0
    const void *data;
837
838
0
    kind = PyUnicode_KIND(unicode);
839
0
    data = PyUnicode_DATA(unicode);
840
841
0
    size = 0;
842
    /* determine replacement size */
843
0
    for (i = collstart; i < collend; ++i) {
844
0
        Py_ssize_t incr;
845
846
0
        ch = PyUnicode_READ(kind, data, i);
847
0
        if (ch < 10)
848
0
            incr = 2+1+1;
849
0
        else if (ch < 100)
850
0
            incr = 2+2+1;
851
0
        else if (ch < 1000)
852
0
            incr = 2+3+1;
853
0
        else if (ch < 10000)
854
0
            incr = 2+4+1;
855
0
        else if (ch < 100000)
856
0
            incr = 2+5+1;
857
0
        else if (ch < 1000000)
858
0
            incr = 2+6+1;
859
0
        else {
860
0
            assert(ch <= MAX_UNICODE);
861
0
            incr = 2+7+1;
862
0
        }
863
0
        if (size > PY_SSIZE_T_MAX - incr) {
864
0
            PyErr_SetString(PyExc_OverflowError,
865
0
                            "encoded result is too long for a Python string");
866
0
            return NULL;
867
0
        }
868
0
        size += incr;
869
0
    }
870
871
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
872
0
    if (str == NULL) {
873
0
        return NULL;
874
0
    }
875
876
    /* generate replacement */
877
0
    for (i = collstart; i < collend; ++i) {
878
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
879
0
        if (size < 0) {
880
0
            return NULL;
881
0
        }
882
0
        str += size;
883
0
    }
884
0
    return str;
885
0
}
886
887
/* --- Bloom Filters ----------------------------------------------------- */
888
889
/* stuff to implement simple "bloom filters" for Unicode characters.
890
   to keep things simple, we use a single bitmask, using the low
891
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
892
893
/* the linebreak mask is set up by _PyUnicode_Init() below */
894
895
/* BLOOM_WIDTH: number of bits used in a bloom mask, chosen from LONG_BIT
   so that a mask always fits in an unsigned long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding one bloom mask. */
#define BLOOM_MASK unsigned long

/* Initialized with every bit set, so BLOOM(bloom_linebreak, ch) is nonzero
   for every character for as long as this initial value is in place. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Nonzero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that arbitrary expressions (for
   example `a | b`) can be passed as the mask. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 are looked up in ascii_linebreak;
   other characters are first filtered through bloom_linebreak and only
   passed to Py_UNICODE_ISLINEBREAK() when their bloom bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
914
915
static inline BLOOM_MASK
916
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
917
4.45M
{
918
4.45M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
919
4.45M
    do {                                               \
920
4.45M
        TYPE *data = (TYPE *)PTR;                      \
921
4.45M
        TYPE *end = data + LEN;                        \
922
4.45M
        Py_UCS4 ch;                                    \
923
10.5M
        for (; data != end; data++) {                  \
924
6.14M
            ch = *data;                                \
925
6.14M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
926
6.14M
        }                                              \
927
4.45M
        break;                                         \
928
4.45M
    } while (0)
929
930
    /* calculate simple bloom-style bitmask for a given unicode string */
931
932
4.45M
    BLOOM_MASK mask;
933
934
4.45M
    mask = 0;
935
4.45M
    switch (kind) {
936
4.45M
    case PyUnicode_1BYTE_KIND:
937
4.45M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
938
4.45M
        break;
939
37
    case PyUnicode_2BYTE_KIND:
940
37
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
941
37
        break;
942
0
    case PyUnicode_4BYTE_KIND:
943
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
944
0
        break;
945
0
    default:
946
0
        Py_UNREACHABLE();
947
4.45M
    }
948
4.45M
    return mask;
949
950
4.45M
#undef BLOOM_UPDATE
951
4.45M
}
952
953
/* Compilation of templated routines */
954
955
755k
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
956
957
#include "stringlib/asciilib.h"
958
#include "stringlib/fastsearch.h"
959
#include "stringlib/partition.h"
960
#include "stringlib/split.h"
961
#include "stringlib/count.h"
962
#include "stringlib/find.h"
963
#include "stringlib/find_max_char.h"
964
#include "stringlib/undef.h"
965
966
#include "stringlib/ucs1lib.h"
967
#include "stringlib/fastsearch.h"
968
#include "stringlib/partition.h"
969
#include "stringlib/split.h"
970
#include "stringlib/count.h"
971
#include "stringlib/find.h"
972
#include "stringlib/replace.h"
973
#include "stringlib/repr.h"
974
#include "stringlib/find_max_char.h"
975
#include "stringlib/undef.h"
976
977
#include "stringlib/ucs2lib.h"
978
#include "stringlib/fastsearch.h"
979
#include "stringlib/partition.h"
980
#include "stringlib/split.h"
981
#include "stringlib/count.h"
982
#include "stringlib/find.h"
983
#include "stringlib/replace.h"
984
#include "stringlib/repr.h"
985
#include "stringlib/find_max_char.h"
986
#include "stringlib/undef.h"
987
988
#include "stringlib/ucs4lib.h"
989
#include "stringlib/fastsearch.h"
990
#include "stringlib/partition.h"
991
#include "stringlib/split.h"
992
#include "stringlib/count.h"
993
#include "stringlib/find.h"
994
#include "stringlib/replace.h"
995
#include "stringlib/repr.h"
996
#include "stringlib/find_max_char.h"
997
#include "stringlib/undef.h"
998
999
#undef STRINGLIB_GET_EMPTY
1000
1001
/* --- Unicode Object ----------------------------------------------------- */
1002
1003
static inline Py_ssize_t
1004
findchar(const void *s, int kind,
1005
         Py_ssize_t size, Py_UCS4 ch,
1006
         int direction)
1007
204M
{
1008
204M
    switch (kind) {
1009
197M
    case PyUnicode_1BYTE_KIND:
1010
197M
        if ((Py_UCS1) ch != ch)
1011
3.85k
            return -1;
1012
197M
        if (direction > 0)
1013
197M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1014
70.8k
        else
1015
70.8k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1016
4.72M
    case PyUnicode_2BYTE_KIND:
1017
4.72M
        if ((Py_UCS2) ch != ch)
1018
0
            return -1;
1019
4.72M
        if (direction > 0)
1020
4.49M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1021
232k
        else
1022
232k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1023
2.66M
    case PyUnicode_4BYTE_KIND:
1024
2.66M
        if (direction > 0)
1025
2.53M
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1026
129k
        else
1027
129k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1028
0
    default:
1029
0
        Py_UNREACHABLE();
1030
204M
    }
1031
204M
}
1032
1033
#ifdef Py_DEBUG
/* Debug-build helper: overwrite the part of a string's data that lies
   between `old_length` and the current length with 0xff bytes, so that bugs
   involving not-yet-written characters are detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Nothing is written when the current length is not greater than
   `old_length`. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* Offsets and sizes are in bytes, hence the scaling by char_size. */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1051
1052
static PyObject*
1053
resize_copy(PyObject *unicode, Py_ssize_t length)
1054
0
{
1055
0
    Py_ssize_t copy_length;
1056
0
    PyObject *copy;
1057
1058
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1059
0
    if (copy == NULL)
1060
0
        return NULL;
1061
1062
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1063
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1064
0
    return copy;
1065
0
}
1066
1067
PyObject*
1068
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
1069
58.8M
{
1070
58.8M
    Py_ssize_t char_size;
1071
58.8M
    Py_ssize_t struct_size;
1072
58.8M
    Py_ssize_t new_size;
1073
58.8M
    PyObject *new_unicode;
1074
#ifdef Py_DEBUG
1075
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1076
#endif
1077
1078
58.8M
    if (!_PyUnicode_IsModifiable(unicode)) {
1079
0
        PyObject *copy = resize_copy(unicode, length);
1080
0
        if (copy == NULL) {
1081
0
            return NULL;
1082
0
        }
1083
0
        Py_DECREF(unicode);
1084
0
        return copy;
1085
0
    }
1086
58.8M
    assert(PyUnicode_IS_COMPACT(unicode));
1087
1088
58.8M
    char_size = PyUnicode_KIND(unicode);
1089
58.8M
    if (PyUnicode_IS_ASCII(unicode))
1090
36.1M
        struct_size = sizeof(PyASCIIObject);
1091
22.7M
    else
1092
22.7M
        struct_size = sizeof(PyCompactUnicodeObject);
1093
1094
58.8M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1095
0
        PyErr_NoMemory();
1096
0
        return NULL;
1097
0
    }
1098
58.8M
    new_size = (struct_size + (length + 1) * char_size);
1099
1100
58.8M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1101
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1102
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1103
0
        PyUnicode_SET_UTF8(unicode, NULL);
1104
0
    }
1105
#ifdef Py_TRACE_REFS
1106
    _Py_ForgetReference(unicode);
1107
#endif
1108
58.8M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1109
1110
58.8M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1111
58.8M
    if (new_unicode == NULL) {
1112
0
        _Py_NewReferenceNoTotal(unicode);
1113
0
        PyErr_NoMemory();
1114
0
        return NULL;
1115
0
    }
1116
58.8M
    unicode = new_unicode;
1117
58.8M
    _Py_NewReferenceNoTotal(unicode);
1118
1119
58.8M
    _PyUnicode_LENGTH(unicode) = length;
1120
#ifdef Py_DEBUG
1121
    unicode_fill_invalid(unicode, old_length);
1122
#endif
1123
58.8M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1124
58.8M
                    length, 0);
1125
58.8M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1126
58.8M
    return unicode;
1127
58.8M
}
1128
1129
static int
1130
resize_inplace(PyObject *unicode, Py_ssize_t length)
1131
0
{
1132
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1133
0
    assert(Py_REFCNT(unicode) == 1);
1134
1135
0
    Py_ssize_t new_size;
1136
0
    Py_ssize_t char_size;
1137
0
    int share_utf8;
1138
0
    void *data;
1139
#ifdef Py_DEBUG
1140
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1141
#endif
1142
1143
0
    data = _PyUnicode_DATA_ANY(unicode);
1144
0
    char_size = PyUnicode_KIND(unicode);
1145
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1146
1147
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1148
0
        PyErr_NoMemory();
1149
0
        return -1;
1150
0
    }
1151
0
    new_size = (length + 1) * char_size;
1152
1153
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1154
0
    {
1155
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1156
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1157
0
        PyUnicode_SET_UTF8(unicode, NULL);
1158
0
    }
1159
1160
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1161
0
    if (data == NULL) {
1162
0
        PyErr_NoMemory();
1163
0
        return -1;
1164
0
    }
1165
0
    _PyUnicode_DATA_ANY(unicode) = data;
1166
0
    if (share_utf8) {
1167
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1168
0
        PyUnicode_SET_UTF8(unicode, data);
1169
0
    }
1170
0
    _PyUnicode_LENGTH(unicode) = length;
1171
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1172
#ifdef Py_DEBUG
1173
    unicode_fill_invalid(unicode, old_length);
1174
#endif
1175
1176
    /* check for integer overflow */
1177
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1178
0
        PyErr_NoMemory();
1179
0
        return -1;
1180
0
    }
1181
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1182
0
    return 0;
1183
0
}
1184
1185
static const char*
1186
unicode_kind_name(PyObject *unicode)
1187
0
{
1188
    /* don't check consistency: unicode_kind_name() is called from
1189
       _PyUnicode_Dump() */
1190
0
    if (!PyUnicode_IS_COMPACT(unicode))
1191
0
    {
1192
0
        switch (PyUnicode_KIND(unicode))
1193
0
        {
1194
0
        case PyUnicode_1BYTE_KIND:
1195
0
            if (PyUnicode_IS_ASCII(unicode))
1196
0
                return "legacy ascii";
1197
0
            else
1198
0
                return "legacy latin1";
1199
0
        case PyUnicode_2BYTE_KIND:
1200
0
            return "legacy UCS2";
1201
0
        case PyUnicode_4BYTE_KIND:
1202
0
            return "legacy UCS4";
1203
0
        default:
1204
0
            return "<legacy invalid kind>";
1205
0
        }
1206
0
    }
1207
0
    switch (PyUnicode_KIND(unicode)) {
1208
0
    case PyUnicode_1BYTE_KIND:
1209
0
        if (PyUnicode_IS_ASCII(unicode))
1210
0
            return "ascii";
1211
0
        else
1212
0
            return "latin1";
1213
0
    case PyUnicode_2BYTE_KIND:
1214
0
        return "UCS2";
1215
0
    case PyUnicode_4BYTE_KIND:
1216
0
        return "UCS4";
1217
0
    default:
1218
0
        return "<invalid compact kind>";
1219
0
    }
1220
0
}
1221
1222
#ifdef Py_DEBUG
1223
/* Functions wrapping macros for use in debugger */
1224
/* Debugger helper: function wrapper around PyUnicode_UTF8() taking an
   untyped pointer to a string object. */
const char *_PyUnicode_utf8(void *unicode_raw){
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1228
1229
/* Debugger helper: function wrapper around _PyUnicode_COMPACT_DATA() taking
   an untyped pointer to a string object. */
const void *_PyUnicode_compact_data(void *unicode_raw) {
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
1233
const void *_PyUnicode_data(void *unicode_raw) {
1234
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1235
    printf("obj %p\n", (void*)unicode);
1236
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1237
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1238
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1239
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1240
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1241
    return PyUnicode_DATA(unicode);
1242
}
1243
1244
void
1245
_PyUnicode_Dump(PyObject *op)
1246
{
1247
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1248
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1249
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1250
    const void *data;
1251
1252
    if (ascii->state.compact)
1253
    {
1254
        if (ascii->state.ascii)
1255
            data = (ascii + 1);
1256
        else
1257
            data = (compact + 1);
1258
    }
1259
    else
1260
        data = unicode->data.any;
1261
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1262
1263
    if (!ascii->state.ascii) {
1264
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1265
    }
1266
    printf(", data=%p\n", data);
1267
}
1268
#endif
1269
1270
1271
PyObject *
1272
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1273
560M
{
1274
    /* Optimization for empty strings */
1275
560M
    if (size == 0) {
1276
25.1M
        return _PyUnicode_GetEmpty();
1277
25.1M
    }
1278
1279
535M
    PyObject *obj;
1280
535M
    PyCompactUnicodeObject *unicode;
1281
535M
    void *data;
1282
535M
    int kind;
1283
535M
    int is_ascii;
1284
535M
    Py_ssize_t char_size;
1285
535M
    Py_ssize_t struct_size;
1286
1287
535M
    is_ascii = 0;
1288
535M
    struct_size = sizeof(PyCompactUnicodeObject);
1289
535M
    if (maxchar < 128) {
1290
342M
        kind = PyUnicode_1BYTE_KIND;
1291
342M
        char_size = 1;
1292
342M
        is_ascii = 1;
1293
342M
        struct_size = sizeof(PyASCIIObject);
1294
342M
    }
1295
193M
    else if (maxchar < 256) {
1296
13.7M
        kind = PyUnicode_1BYTE_KIND;
1297
13.7M
        char_size = 1;
1298
13.7M
    }
1299
179M
    else if (maxchar < 65536) {
1300
168M
        kind = PyUnicode_2BYTE_KIND;
1301
168M
        char_size = 2;
1302
168M
    }
1303
11.4M
    else {
1304
11.4M
        if (maxchar > MAX_UNICODE) {
1305
0
            PyErr_SetString(PyExc_SystemError,
1306
0
                            "invalid maximum character passed to PyUnicode_New");
1307
0
            return NULL;
1308
0
        }
1309
11.4M
        kind = PyUnicode_4BYTE_KIND;
1310
11.4M
        char_size = 4;
1311
11.4M
    }
1312
1313
    /* Ensure we won't overflow the size. */
1314
535M
    if (size < 0) {
1315
0
        PyErr_SetString(PyExc_SystemError,
1316
0
                        "Negative size passed to PyUnicode_New");
1317
0
        return NULL;
1318
0
    }
1319
535M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1320
0
        return PyErr_NoMemory();
1321
1322
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1323
     * PyObject_New() so we are able to allocate space for the object and
1324
     * it's data buffer.
1325
     */
1326
535M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1327
535M
    if (obj == NULL) {
1328
0
        return PyErr_NoMemory();
1329
0
    }
1330
535M
    _PyObject_Init(obj, &PyUnicode_Type);
1331
1332
535M
    unicode = (PyCompactUnicodeObject *)obj;
1333
535M
    if (is_ascii)
1334
342M
        data = ((PyASCIIObject*)obj) + 1;
1335
193M
    else
1336
193M
        data = unicode + 1;
1337
535M
    _PyUnicode_LENGTH(unicode) = size;
1338
535M
    _PyUnicode_HASH(unicode) = -1;
1339
535M
    _PyUnicode_STATE(unicode).interned = 0;
1340
535M
    _PyUnicode_STATE(unicode).kind = kind;
1341
535M
    _PyUnicode_STATE(unicode).compact = 1;
1342
535M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1343
535M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1344
535M
    if (is_ascii) {
1345
342M
        ((char*)data)[size] = 0;
1346
342M
    }
1347
193M
    else if (kind == PyUnicode_1BYTE_KIND) {
1348
13.7M
        ((char*)data)[size] = 0;
1349
13.7M
        unicode->utf8 = NULL;
1350
13.7M
        unicode->utf8_length = 0;
1351
13.7M
    }
1352
179M
    else {
1353
179M
        unicode->utf8 = NULL;
1354
179M
        unicode->utf8_length = 0;
1355
179M
        if (kind == PyUnicode_2BYTE_KIND)
1356
168M
            ((Py_UCS2*)data)[size] = 0;
1357
11.4M
        else /* kind == PyUnicode_4BYTE_KIND */
1358
11.4M
            ((Py_UCS4*)data)[size] = 0;
1359
179M
    }
1360
#ifdef Py_DEBUG
1361
    unicode_fill_invalid((PyObject*)unicode, 0);
1362
#endif
1363
535M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1364
535M
    return obj;
1365
535M
}
1366
1367
static int
1368
unicode_check_modifiable(PyObject *unicode)
1369
634
{
1370
634
    if (!_PyUnicode_IsModifiable(unicode)) {
1371
0
        PyErr_SetString(PyExc_SystemError,
1372
0
                        "Cannot modify a string currently used");
1373
0
        return -1;
1374
0
    }
1375
634
    return 0;
1376
634
}
1377
1378
static int
1379
_copy_characters(PyObject *to, Py_ssize_t to_start,
1380
                 PyObject *from, Py_ssize_t from_start,
1381
                 Py_ssize_t how_many, int check_maxchar)
1382
263M
{
1383
263M
    int from_kind, to_kind;
1384
263M
    const void *from_data;
1385
263M
    void *to_data;
1386
1387
263M
    assert(0 <= how_many);
1388
263M
    assert(0 <= from_start);
1389
263M
    assert(0 <= to_start);
1390
263M
    assert(PyUnicode_Check(from));
1391
263M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1392
1393
263M
    assert(to == NULL || PyUnicode_Check(to));
1394
1395
263M
    if (how_many == 0) {
1396
5.64M
        return 0;
1397
5.64M
    }
1398
1399
263M
    assert(to != NULL);
1400
257M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1401
1402
257M
    from_kind = PyUnicode_KIND(from);
1403
257M
    from_data = PyUnicode_DATA(from);
1404
257M
    to_kind = PyUnicode_KIND(to);
1405
257M
    to_data = PyUnicode_DATA(to);
1406
1407
#ifdef Py_DEBUG
1408
    if (!check_maxchar
1409
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1410
    {
1411
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1412
        Py_UCS4 ch;
1413
        Py_ssize_t i;
1414
        for (i=0; i < how_many; i++) {
1415
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1416
            assert(ch <= to_maxchar);
1417
        }
1418
    }
1419
#endif
1420
1421
257M
    if (from_kind == to_kind) {
1422
162M
        if (check_maxchar
1423
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1424
0
        {
1425
            /* Writing Latin-1 characters into an ASCII string requires to
1426
               check that all written characters are pure ASCII */
1427
0
            Py_UCS4 max_char;
1428
0
            max_char = ucs1lib_find_max_char(from_data,
1429
0
                                             (const Py_UCS1*)from_data + how_many);
1430
0
            if (max_char >= 128)
1431
0
                return -1;
1432
0
        }
1433
162M
        memcpy((char*)to_data + to_kind * to_start,
1434
162M
                  (const char*)from_data + from_kind * from_start,
1435
162M
                  to_kind * how_many);
1436
162M
    }
1437
94.9M
    else if (from_kind == PyUnicode_1BYTE_KIND
1438
91.9M
             && to_kind == PyUnicode_2BYTE_KIND)
1439
78.9M
    {
1440
78.9M
        _PyUnicode_CONVERT_BYTES(
1441
78.9M
            Py_UCS1, Py_UCS2,
1442
78.9M
            PyUnicode_1BYTE_DATA(from) + from_start,
1443
78.9M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1444
78.9M
            PyUnicode_2BYTE_DATA(to) + to_start
1445
78.9M
            );
1446
78.9M
    }
1447
15.9M
    else if (from_kind == PyUnicode_1BYTE_KIND
1448
12.9M
             && to_kind == PyUnicode_4BYTE_KIND)
1449
12.9M
    {
1450
12.9M
        _PyUnicode_CONVERT_BYTES(
1451
12.9M
            Py_UCS1, Py_UCS4,
1452
12.9M
            PyUnicode_1BYTE_DATA(from) + from_start,
1453
12.9M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1454
12.9M
            PyUnicode_4BYTE_DATA(to) + to_start
1455
12.9M
            );
1456
12.9M
    }
1457
3.01M
    else if (from_kind == PyUnicode_2BYTE_KIND
1458
3.00M
             && to_kind == PyUnicode_4BYTE_KIND)
1459
3.00M
    {
1460
3.00M
        _PyUnicode_CONVERT_BYTES(
1461
3.00M
            Py_UCS2, Py_UCS4,
1462
3.00M
            PyUnicode_2BYTE_DATA(from) + from_start,
1463
3.00M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1464
3.00M
            PyUnicode_4BYTE_DATA(to) + to_start
1465
3.00M
            );
1466
3.00M
    }
1467
11.7k
    else {
1468
11.7k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1469
1470
11.7k
        if (!check_maxchar) {
1471
11.7k
            if (from_kind == PyUnicode_2BYTE_KIND
1472
2.62k
                && to_kind == PyUnicode_1BYTE_KIND)
1473
2.62k
            {
1474
2.62k
                _PyUnicode_CONVERT_BYTES(
1475
2.62k
                    Py_UCS2, Py_UCS1,
1476
2.62k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1477
2.62k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1478
2.62k
                    PyUnicode_1BYTE_DATA(to) + to_start
1479
2.62k
                    );
1480
2.62k
            }
1481
9.13k
            else if (from_kind == PyUnicode_4BYTE_KIND
1482
9.13k
                     && to_kind == PyUnicode_1BYTE_KIND)
1483
5.49k
            {
1484
5.49k
                _PyUnicode_CONVERT_BYTES(
1485
5.49k
                    Py_UCS4, Py_UCS1,
1486
5.49k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1487
5.49k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1488
5.49k
                    PyUnicode_1BYTE_DATA(to) + to_start
1489
5.49k
                    );
1490
5.49k
            }
1491
3.64k
            else if (from_kind == PyUnicode_4BYTE_KIND
1492
3.64k
                     && to_kind == PyUnicode_2BYTE_KIND)
1493
3.64k
            {
1494
3.64k
                _PyUnicode_CONVERT_BYTES(
1495
3.64k
                    Py_UCS4, Py_UCS2,
1496
3.64k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1497
3.64k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1498
3.64k
                    PyUnicode_2BYTE_DATA(to) + to_start
1499
3.64k
                    );
1500
3.64k
            }
1501
0
            else {
1502
0
                Py_UNREACHABLE();
1503
0
            }
1504
11.7k
        }
1505
0
        else {
1506
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1507
0
            Py_UCS4 ch;
1508
0
            Py_ssize_t i;
1509
1510
0
            for (i=0; i < how_many; i++) {
1511
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1512
0
                if (ch > to_maxchar)
1513
0
                    return -1;
1514
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1515
0
            }
1516
0
        }
1517
11.7k
    }
1518
257M
    return 0;
1519
257M
}
1520
1521
void
1522
_PyUnicode_FastCopyCharacters(
1523
    PyObject *to, Py_ssize_t to_start,
1524
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1525
263M
{
1526
263M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1527
263M
}
1528
1529
Py_ssize_t
1530
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1531
                         PyObject *from, Py_ssize_t from_start,
1532
                         Py_ssize_t how_many)
1533
0
{
1534
0
    int err;
1535
1536
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1537
0
        PyErr_BadInternalCall();
1538
0
        return -1;
1539
0
    }
1540
1541
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1542
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1543
0
        return -1;
1544
0
    }
1545
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1546
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1547
0
        return -1;
1548
0
    }
1549
0
    if (how_many < 0) {
1550
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1551
0
        return -1;
1552
0
    }
1553
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1554
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1555
0
        PyErr_Format(PyExc_SystemError,
1556
0
                     "Cannot write %zi characters at %zi "
1557
0
                     "in a string of %zi characters",
1558
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1559
0
        return -1;
1560
0
    }
1561
1562
0
    if (how_many == 0)
1563
0
        return 0;
1564
1565
0
    if (unicode_check_modifiable(to))
1566
0
        return -1;
1567
1568
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1569
0
    if (err) {
1570
0
        PyErr_Format(PyExc_SystemError,
1571
0
                     "Cannot copy %s characters "
1572
0
                     "into a string of %s characters",
1573
0
                     unicode_kind_name(from),
1574
0
                     unicode_kind_name(to));
1575
0
        return -1;
1576
0
    }
1577
0
    return how_many;
1578
0
}
1579
1580
/* Find the maximum code point and count the number of surrogate pairs so a
1581
   correct string length can be computed before converting a string to UCS4.
1582
   This function counts single surrogates as a character and not as a pair.
1583
1584
   Return 0 on success, or -1 on error. */
1585
static int
1586
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1587
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1588
499k
{
1589
499k
    const wchar_t *iter;
1590
499k
    Py_UCS4 ch;
1591
1592
499k
    assert(num_surrogates != NULL && maxchar != NULL);
1593
499k
    *num_surrogates = 0;
1594
499k
    *maxchar = 0;
1595
1596
14.8M
    for (iter = begin; iter < end; ) {
1597
#if SIZEOF_WCHAR_T == 2
1598
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1599
            && (iter+1) < end
1600
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1601
        {
1602
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1603
            ++(*num_surrogates);
1604
            iter += 2;
1605
        }
1606
        else
1607
#endif
1608
14.3M
        {
1609
14.3M
            ch = *iter;
1610
14.3M
            iter++;
1611
14.3M
        }
1612
14.3M
        if (ch > *maxchar) {
1613
2.06M
            *maxchar = ch;
1614
2.06M
            if (*maxchar > MAX_UNICODE) {
1615
0
                PyErr_Format(PyExc_ValueError,
1616
0
                             "character U+%x is not in range [U+0000; U+%x]",
1617
0
                             ch, MAX_UNICODE);
1618
0
                return -1;
1619
0
            }
1620
2.06M
        }
1621
14.3M
    }
1622
499k
    return 0;
1623
499k
}
1624
1625
static void
1626
unicode_dealloc(PyObject *unicode)
1627
550M
{
1628
#ifdef Py_DEBUG
1629
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1630
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1631
    }
1632
#endif
1633
550M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1634
        /* This should never get called, but we also don't want to SEGV if
1635
        * we accidentally decref an immortal string out of existence. Since
1636
        * the string is an immortal object, just re-set the reference count.
1637
        */
1638
#ifdef Py_DEBUG
1639
        Py_UNREACHABLE();
1640
#endif
1641
0
        _Py_SetImmortal(unicode);
1642
0
        return;
1643
0
    }
1644
550M
    switch (_PyUnicode_STATE(unicode).interned) {
1645
549M
        case SSTATE_NOT_INTERNED:
1646
549M
            break;
1647
456k
        case SSTATE_INTERNED_MORTAL:
1648
            /* Remove the object from the intern dict.
1649
             * Before doing so, we set the refcount to 2: the key and value
1650
             * in the interned_dict.
1651
             */
1652
456k
            assert(Py_REFCNT(unicode) == 0);
1653
456k
            Py_SET_REFCNT(unicode, 2);
1654
#ifdef Py_REF_DEBUG
1655
            /* let's be pedantic with the ref total */
1656
            _Py_IncRefTotal(_PyThreadState_GET());
1657
            _Py_IncRefTotal(_PyThreadState_GET());
1658
#endif
1659
456k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1660
456k
            PyObject *interned = get_interned_dict(interp);
1661
456k
            assert(interned != NULL);
1662
456k
            PyObject *popped;
1663
456k
            int r = PyDict_Pop(interned, unicode, &popped);
1664
456k
            if (r == -1) {
1665
0
                PyErr_FormatUnraisable("Exception ignored while "
1666
0
                                       "removing an interned string %R",
1667
0
                                       unicode);
1668
                // We don't know what happened to the string. It's probably
1669
                // best to leak it:
1670
                // - if it was popped, there are no more references to it
1671
                //   so it can't cause trouble (except wasted memory)
1672
                // - if it wasn't popped, it'll remain interned
1673
0
                _Py_SetImmortal(unicode);
1674
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1675
0
                return;
1676
0
            }
1677
456k
            if (r == 0) {
1678
                // The interned string was not found in the interned_dict.
1679
#ifdef Py_DEBUG
1680
                Py_UNREACHABLE();
1681
#endif
1682
0
                _Py_SetImmortal(unicode);
1683
0
                return;
1684
0
            }
1685
            // Successfully popped.
1686
456k
            assert(popped == unicode);
1687
            // Only our `popped` reference should be left; remove it too.
1688
456k
            assert(Py_REFCNT(unicode) == 1);
1689
456k
            Py_SET_REFCNT(unicode, 0);
1690
#ifdef Py_REF_DEBUG
1691
            /* let's be pedantic with the ref total */
1692
            _Py_DecRefTotal(_PyThreadState_GET());
1693
#endif
1694
456k
            break;
1695
0
        default:
1696
            // As with `statically_allocated` above.
1697
#ifdef Py_REF_DEBUG
1698
            Py_UNREACHABLE();
1699
#endif
1700
0
            _Py_SetImmortal(unicode);
1701
0
            return;
1702
550M
    }
1703
550M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1704
150k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1705
150k
    }
1706
550M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1707
15.5M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1708
15.5M
    }
1709
1710
550M
    Py_TYPE(unicode)->tp_free(unicode);
1711
550M
}
1712
1713
#ifdef Py_DEBUG
/* Return 1 if `unicode` is the empty string object or a one-character
   string that is the object returned by LATIN1() for its character;
   return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    if (_PyASCIIObject_CAST(unicode)->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && LATIN1(ch) == unicode) ? 1 : 0;
}
#endif
1731
1732
int
1733
_PyUnicode_IsModifiable(PyObject *unicode)
1734
65.9M
{
1735
65.9M
    assert(_PyUnicode_CHECK(unicode));
1736
65.9M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1737
3.20M
        return 0;
1738
62.7M
    if (PyUnicode_HASH(unicode) != -1)
1739
0
        return 0;
1740
62.7M
    if (PyUnicode_CHECK_INTERNED(unicode))
1741
0
        return 0;
1742
62.7M
    if (!PyUnicode_CheckExact(unicode))
1743
0
        return 0;
1744
#ifdef Py_DEBUG
1745
    /* singleton refcount is greater than 1 */
1746
    assert(!unicode_is_singleton(unicode));
1747
#endif
1748
62.7M
    return 1;
1749
62.7M
}
1750
1751
static int
1752
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1753
1.91M
{
1754
1.91M
    PyObject *unicode;
1755
1.91M
    Py_ssize_t old_length;
1756
1757
1.91M
    assert(p_unicode != NULL);
1758
1.91M
    unicode = *p_unicode;
1759
1760
1.91M
    assert(unicode != NULL);
1761
1.91M
    assert(PyUnicode_Check(unicode));
1762
1.91M
    assert(0 <= length);
1763
1764
1.91M
    old_length = PyUnicode_GET_LENGTH(unicode);
1765
1.91M
    if (old_length == length)
1766
0
        return 0;
1767
1768
1.91M
    if (length == 0) {
1769
0
        PyObject *empty = _PyUnicode_GetEmpty();
1770
0
        Py_SETREF(*p_unicode, empty);
1771
0
        return 0;
1772
0
    }
1773
1774
1.91M
    if (!_PyUnicode_IsModifiable(unicode)) {
1775
0
        PyObject *copy = resize_copy(unicode, length);
1776
0
        if (copy == NULL)
1777
0
            return -1;
1778
0
        Py_SETREF(*p_unicode, copy);
1779
0
        return 0;
1780
0
    }
1781
1782
1.91M
    if (PyUnicode_IS_COMPACT(unicode)) {
1783
1.91M
        PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
1784
1.91M
        if (new_unicode == NULL)
1785
0
            return -1;
1786
1.91M
        *p_unicode = new_unicode;
1787
1.91M
        return 0;
1788
1.91M
    }
1789
0
    return resize_inplace(unicode, length);
1790
1.91M
}
1791
1792
int
1793
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1794
0
{
1795
0
    PyObject *unicode;
1796
0
    if (p_unicode == NULL) {
1797
0
        PyErr_BadInternalCall();
1798
0
        return -1;
1799
0
    }
1800
0
    unicode = *p_unicode;
1801
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1802
0
    {
1803
0
        PyErr_BadInternalCall();
1804
0
        return -1;
1805
0
    }
1806
0
    return unicode_resize(p_unicode, length);
1807
0
}
1808
1809
static PyObject*
1810
get_latin1_char(Py_UCS1 ch)
1811
224M
{
1812
224M
    PyObject *o = LATIN1(ch);
1813
224M
    return o;
1814
224M
}
1815
1816
static PyObject*
1817
unicode_char(Py_UCS4 ch)
1818
260M
{
1819
260M
    PyObject *unicode;
1820
1821
260M
    assert(ch <= MAX_UNICODE);
1822
1823
260M
    if (ch < 256) {
1824
136M
        return get_latin1_char(ch);
1825
136M
    }
1826
1827
123M
    unicode = PyUnicode_New(1, ch);
1828
123M
    if (unicode == NULL)
1829
0
        return NULL;
1830
1831
123M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1832
123M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1833
114M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1834
114M
    } else {
1835
8.98M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1836
8.98M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1837
8.98M
    }
1838
123M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1839
123M
    return unicode;
1840
123M
}
1841
1842
1843
static inline void
1844
unicode_write_widechar(int kind, void *data,
1845
                       const wchar_t *u, Py_ssize_t size,
1846
                       Py_ssize_t num_surrogates)
1847
499k
{
1848
499k
    switch (kind) {
1849
464k
    case PyUnicode_1BYTE_KIND:
1850
464k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1851
464k
        break;
1852
1853
33.0k
    case PyUnicode_2BYTE_KIND:
1854
#if SIZEOF_WCHAR_T == 2
1855
        memcpy(data, u, size * 2);
1856
#else
1857
33.0k
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1858
33.0k
#endif
1859
33.0k
        break;
1860
1861
1.31k
    case PyUnicode_4BYTE_KIND:
1862
1.31k
    {
1863
#if SIZEOF_WCHAR_T == 2
1864
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1865
        // surrogate pairs.
1866
        const wchar_t *end = u + size;
1867
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1868
#  ifndef NDEBUG
1869
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1870
#  endif
1871
        for (const wchar_t *iter = u; iter < end; ) {
1872
            assert(ucs4_out < ucs4_end);
1873
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1874
                && (iter+1) < end
1875
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1876
            {
1877
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1878
                iter += 2;
1879
            }
1880
            else {
1881
                *ucs4_out++ = *iter;
1882
                iter++;
1883
            }
1884
        }
1885
        assert(ucs4_out == ucs4_end);
1886
#else
1887
1.31k
        assert(num_surrogates == 0);
1888
1.31k
        memcpy(data, u, size * 4);
1889
1.31k
#endif
1890
1.31k
        break;
1891
0
    }
1892
0
    default:
1893
0
        Py_UNREACHABLE();
1894
499k
    }
1895
499k
}
1896
1897
1898
PyObject *
1899
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1900
896k
{
1901
896k
    PyObject *unicode;
1902
896k
    Py_UCS4 maxchar = 0;
1903
896k
    Py_ssize_t num_surrogates;
1904
1905
896k
    if (u == NULL && size != 0) {
1906
0
        PyErr_BadInternalCall();
1907
0
        return NULL;
1908
0
    }
1909
1910
896k
    if (size == -1) {
1911
1.33k
        size = wcslen(u);
1912
1.33k
    }
1913
1914
    /* If the Unicode data is known at construction time, we can apply
1915
       some optimizations which share commonly used objects. */
1916
1917
    /* Optimization for empty strings */
1918
896k
    if (size == 0)
1919
322k
        _Py_RETURN_UNICODE_EMPTY();
1920
1921
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1922
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1923
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1924
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1925
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1926
        if (!converted) {
1927
            return NULL;
1928
        }
1929
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1930
        PyMem_Free(converted);
1931
        return unicode;
1932
    }
1933
#endif
1934
1935
    /* Single character Unicode objects in the Latin-1 range are
1936
       shared when using this constructor */
1937
574k
    if (size == 1 && (Py_UCS4)*u < 256)
1938
75.2k
        return get_latin1_char((unsigned char)*u);
1939
1940
    /* If not empty and not single character, copy the Unicode data
1941
       into the new object */
1942
499k
    if (find_maxchar_surrogates(u, u + size,
1943
499k
                                &maxchar, &num_surrogates) == -1)
1944
0
        return NULL;
1945
1946
499k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1947
499k
    if (!unicode)
1948
0
        return NULL;
1949
1950
499k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1951
499k
                           u, size, num_surrogates);
1952
1953
499k
    return unicode_result(unicode);
1954
499k
}
1955
1956
1957
int
1958
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
1959
                              const wchar_t *str,
1960
                              Py_ssize_t size)
1961
0
{
1962
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
1963
1964
0
    if (size < 0) {
1965
0
        size = wcslen(str);
1966
0
    }
1967
1968
0
    if (size == 0) {
1969
0
        return 0;
1970
0
    }
1971
1972
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1973
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1974
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1975
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1976
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
1977
        if (!converted) {
1978
            return -1;
1979
        }
1980
1981
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
1982
        PyMem_Free(converted);
1983
        return res;
1984
    }
1985
#endif
1986
1987
0
    Py_UCS4 maxchar = 0;
1988
0
    Py_ssize_t num_surrogates;
1989
0
    if (find_maxchar_surrogates(str, str + size,
1990
0
                                &maxchar, &num_surrogates) == -1) {
1991
0
        return -1;
1992
0
    }
1993
1994
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
1995
0
        return -1;
1996
0
    }
1997
1998
0
    int kind = writer->kind;
1999
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2000
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
2001
2002
0
    writer->pos += size - num_surrogates;
2003
0
    return 0;
2004
0
}
2005
2006
2007
PyObject *
2008
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2009
7.40M
{
2010
7.40M
    if (size < 0) {
2011
0
        PyErr_SetString(PyExc_SystemError,
2012
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2013
0
        return NULL;
2014
0
    }
2015
7.40M
    if (u != NULL) {
2016
7.40M
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2017
7.40M
    }
2018
0
    if (size > 0) {
2019
0
        PyErr_SetString(PyExc_SystemError,
2020
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2021
0
        return NULL;
2022
0
    }
2023
0
    return _PyUnicode_GetEmpty();
2024
0
}
2025
2026
PyObject *
2027
PyUnicode_FromString(const char *u)
2028
21.6M
{
2029
21.6M
    size_t size = strlen(u);
2030
21.6M
    if (size > PY_SSIZE_T_MAX) {
2031
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2032
0
        return NULL;
2033
0
    }
2034
21.6M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2035
21.6M
}
2036
2037
2038
PyObject *
2039
_PyUnicode_FromId(_Py_Identifier *id)
2040
0
{
2041
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2042
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2043
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2044
2045
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2046
0
    if (index < 0) {
2047
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2048
2049
0
        PyMutex_Lock(&rt_ids->mutex);
2050
        // Check again to detect concurrent access. Another thread can have
2051
        // initialized the index while this thread waited for the lock.
2052
0
        index = _Py_atomic_load_ssize(&id->index);
2053
0
        if (index < 0) {
2054
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2055
0
            index = rt_ids->next_index;
2056
0
            rt_ids->next_index++;
2057
0
            _Py_atomic_store_ssize(&id->index, index);
2058
0
        }
2059
0
        PyMutex_Unlock(&rt_ids->mutex);
2060
0
    }
2061
0
    assert(index >= 0);
2062
2063
0
    PyObject *obj;
2064
0
    if (index < ids->size) {
2065
0
        obj = ids->array[index];
2066
0
        if (obj) {
2067
            // Return a borrowed reference
2068
0
            goto end;
2069
0
        }
2070
0
    }
2071
2072
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2073
0
                                       NULL, NULL);
2074
0
    if (!obj) {
2075
0
        goto end;
2076
0
    }
2077
0
    _PyUnicode_InternImmortal(interp, &obj);
2078
2079
0
    if (index >= ids->size) {
2080
        // Overallocate to reduce the number of realloc
2081
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2082
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2083
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2084
0
        if (new_array == NULL) {
2085
0
            PyErr_NoMemory();
2086
0
            obj = NULL;
2087
0
            goto end;
2088
0
        }
2089
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2090
0
        ids->array = new_array;
2091
0
        ids->size = new_size;
2092
0
    }
2093
2094
    // The array stores a strong reference
2095
0
    ids->array[index] = obj;
2096
2097
0
end:
2098
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2099
    // Return a borrowed reference
2100
0
    return obj;
2101
0
}
2102
2103
2104
static void
2105
unicode_clear_identifiers(struct _Py_unicode_state *state)
2106
0
{
2107
0
    struct _Py_unicode_ids *ids = &state->ids;
2108
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2109
0
        Py_XDECREF(ids->array[i]);
2110
0
    }
2111
0
    ids->size = 0;
2112
0
    PyMem_Free(ids->array);
2113
0
    ids->array = NULL;
2114
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2115
    // after Py_Finalize().
2116
0
}
2117
2118
2119
/* Internal function, doesn't check maximum character */
2120
2121
PyObject*
2122
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2123
113M
{
2124
113M
    const unsigned char *s = (const unsigned char *)buffer;
2125
113M
    PyObject *unicode;
2126
113M
    if (size == 1) {
2127
#ifdef Py_DEBUG
2128
        assert((unsigned char)s[0] < 128);
2129
#endif
2130
40.2M
        return get_latin1_char(s[0]);
2131
40.2M
    }
2132
73.5M
    unicode = PyUnicode_New(size, 127);
2133
73.5M
    if (!unicode)
2134
0
        return NULL;
2135
73.5M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2136
73.5M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2137
73.5M
    return unicode;
2138
73.5M
}
2139
2140
static Py_UCS4
2141
kind_maxchar_limit(int kind)
2142
0
{
2143
0
    switch (kind) {
2144
0
    case PyUnicode_1BYTE_KIND:
2145
0
        return 0x80;
2146
0
    case PyUnicode_2BYTE_KIND:
2147
0
        return 0x100;
2148
0
    case PyUnicode_4BYTE_KIND:
2149
0
        return 0x10000;
2150
0
    default:
2151
0
        Py_UNREACHABLE();
2152
0
    }
2153
0
}
2154
2155
static PyObject*
2156
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2157
47.6M
{
2158
47.6M
    PyObject *res;
2159
47.6M
    unsigned char max_char;
2160
2161
47.6M
    if (size == 0) {
2162
7.78M
        _Py_RETURN_UNICODE_EMPTY();
2163
7.78M
    }
2164
47.6M
    assert(size > 0);
2165
39.8M
    if (size == 1) {
2166
10.4M
        return get_latin1_char(u[0]);
2167
10.4M
    }
2168
2169
29.3M
    max_char = ucs1lib_find_max_char(u, u + size);
2170
29.3M
    res = PyUnicode_New(size, max_char);
2171
29.3M
    if (!res)
2172
0
        return NULL;
2173
29.3M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2174
29.3M
    assert(_PyUnicode_CheckConsistency(res, 1));
2175
29.3M
    return res;
2176
29.3M
}
2177
2178
static PyObject*
2179
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2180
96.3M
{
2181
96.3M
    PyObject *res;
2182
96.3M
    Py_UCS2 max_char;
2183
2184
96.3M
    if (size == 0)
2185
17.5M
        _Py_RETURN_UNICODE_EMPTY();
2186
96.3M
    assert(size > 0);
2187
78.8M
    if (size == 1)
2188
52.0M
        return unicode_char(u[0]);
2189
2190
26.8M
    max_char = ucs2lib_find_max_char(u, u + size);
2191
26.8M
    res = PyUnicode_New(size, max_char);
2192
26.8M
    if (!res)
2193
0
        return NULL;
2194
26.8M
    if (max_char >= 256)
2195
16.8M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2196
9.93M
    else {
2197
9.93M
        _PyUnicode_CONVERT_BYTES(
2198
9.93M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2199
9.93M
    }
2200
26.8M
    assert(_PyUnicode_CheckConsistency(res, 1));
2201
26.8M
    return res;
2202
26.8M
}
2203
2204
static PyObject*
2205
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2206
66.6M
{
2207
66.6M
    PyObject *res;
2208
66.6M
    Py_UCS4 max_char;
2209
2210
66.6M
    if (size == 0)
2211
8.23M
        _Py_RETURN_UNICODE_EMPTY();
2212
66.6M
    assert(size > 0);
2213
58.4M
    if (size == 1)
2214
39.0M
        return unicode_char(u[0]);
2215
2216
19.3M
    max_char = ucs4lib_find_max_char(u, u + size);
2217
19.3M
    res = PyUnicode_New(size, max_char);
2218
19.3M
    if (!res)
2219
0
        return NULL;
2220
19.3M
    if (max_char < 256)
2221
13.5M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2222
19.3M
                                 PyUnicode_1BYTE_DATA(res));
2223
5.77M
    else if (max_char < 0x10000)
2224
4.39M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2225
5.77M
                                 PyUnicode_2BYTE_DATA(res));
2226
1.37M
    else
2227
1.37M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2228
19.3M
    assert(_PyUnicode_CheckConsistency(res, 1));
2229
19.3M
    return res;
2230
19.3M
}
2231
2232
2233
int
2234
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2235
                          const Py_UCS4 *str,
2236
                          Py_ssize_t size)
2237
0
{
2238
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2239
2240
0
    if (size < 0) {
2241
0
        PyErr_SetString(PyExc_ValueError,
2242
0
                        "size must be positive");
2243
0
        return -1;
2244
0
    }
2245
2246
0
    if (size == 0) {
2247
0
        return 0;
2248
0
    }
2249
2250
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2251
2252
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2253
0
        return -1;
2254
0
    }
2255
2256
0
    int kind = writer->kind;
2257
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2258
0
    if (kind == PyUnicode_1BYTE_KIND) {
2259
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2260
0
                                 str, str + size,
2261
0
                                 data);
2262
0
    }
2263
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2264
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2265
0
                                 str, str + size,
2266
0
                                 data);
2267
0
    }
2268
0
    else {
2269
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2270
0
    }
2271
0
    writer->pos += size;
2272
2273
0
    return 0;
2274
0
}
2275
2276
2277
PyObject*
2278
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2279
155M
{
2280
155M
    if (size < 0) {
2281
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2282
0
        return NULL;
2283
0
    }
2284
155M
    switch (kind) {
2285
21.8M
    case PyUnicode_1BYTE_KIND:
2286
21.8M
        return _PyUnicode_FromUCS1(buffer, size);
2287
78.1M
    case PyUnicode_2BYTE_KIND:
2288
78.1M
        return _PyUnicode_FromUCS2(buffer, size);
2289
55.4M
    case PyUnicode_4BYTE_KIND:
2290
55.4M
        return _PyUnicode_FromUCS4(buffer, size);
2291
0
    default:
2292
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2293
0
        return NULL;
2294
155M
    }
2295
155M
}
2296
2297
Py_UCS4
2298
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2299
11.3M
{
2300
11.3M
    int kind;
2301
11.3M
    const void *startptr, *endptr;
2302
2303
11.3M
    assert(0 <= start);
2304
11.3M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2305
11.3M
    assert(start <= end);
2306
2307
11.3M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2308
78.3k
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2309
2310
11.2M
    if (start == end)
2311
0
        return 127;
2312
2313
11.2M
    if (PyUnicode_IS_ASCII(unicode))
2314
11.2M
        return 127;
2315
2316
27.5k
    kind = PyUnicode_KIND(unicode);
2317
27.5k
    startptr = PyUnicode_DATA(unicode);
2318
27.5k
    endptr = (char *)startptr + end * kind;
2319
27.5k
    startptr = (char *)startptr + start * kind;
2320
27.5k
    switch(kind) {
2321
5.23k
    case PyUnicode_1BYTE_KIND:
2322
5.23k
        return ucs1lib_find_max_char(startptr, endptr);
2323
4.25k
    case PyUnicode_2BYTE_KIND:
2324
4.25k
        return ucs2lib_find_max_char(startptr, endptr);
2325
18.1k
    case PyUnicode_4BYTE_KIND:
2326
18.1k
        return ucs4lib_find_max_char(startptr, endptr);
2327
0
    default:
2328
0
        Py_UNREACHABLE();
2329
27.5k
    }
2330
27.5k
}
2331
2332
/* Ensure that a string uses the most efficient storage, if it is not the
2333
   case: create a new string with of the right kind. Write NULL into *p_unicode
2334
   on error. */
2335
static void
2336
unicode_adjust_maxchar(PyObject **p_unicode)
2337
0
{
2338
0
    PyObject *unicode, *copy;
2339
0
    Py_UCS4 max_char;
2340
0
    Py_ssize_t len;
2341
0
    int kind;
2342
2343
0
    assert(p_unicode != NULL);
2344
0
    unicode = *p_unicode;
2345
0
    if (PyUnicode_IS_ASCII(unicode))
2346
0
        return;
2347
2348
0
    len = PyUnicode_GET_LENGTH(unicode);
2349
0
    kind = PyUnicode_KIND(unicode);
2350
0
    if (kind == PyUnicode_1BYTE_KIND) {
2351
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2352
0
        max_char = ucs1lib_find_max_char(u, u + len);
2353
0
        if (max_char >= 128)
2354
0
            return;
2355
0
    }
2356
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2357
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2358
0
        max_char = ucs2lib_find_max_char(u, u + len);
2359
0
        if (max_char >= 256)
2360
0
            return;
2361
0
    }
2362
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2363
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2364
0
        max_char = ucs4lib_find_max_char(u, u + len);
2365
0
        if (max_char >= 0x10000)
2366
0
            return;
2367
0
    }
2368
0
    else
2369
0
        Py_UNREACHABLE();
2370
2371
0
    copy = PyUnicode_New(len, max_char);
2372
0
    if (copy != NULL)
2373
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2374
0
    Py_DECREF(unicode);
2375
0
    *p_unicode = copy;
2376
0
}
2377
2378
/* Return a new string with the same length, maximum character value and
   character data as `unicode`.

   A non-str argument is reported through PyErr_BadInternalCall().
   Return NULL on error. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
    PyObject *copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
    if (copy == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));

    /* The kind is the number of bytes per character. */
    memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
           length * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(copy, 1));
    return copy;
}
2400
2401
2402
/* Widen Unicode objects to larger buffers. Don't write terminating null
2403
   character. Return NULL on error. */
2404
2405
static void*
2406
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2407
7.47M
{
2408
7.47M
    void *result;
2409
2410
7.47M
    assert(skind < kind);
2411
7.47M
    switch (kind) {
2412
4.84M
    case PyUnicode_2BYTE_KIND:
2413
4.84M
        result = PyMem_New(Py_UCS2, len);
2414
4.84M
        if (!result)
2415
0
            return PyErr_NoMemory();
2416
4.84M
        assert(skind == PyUnicode_1BYTE_KIND);
2417
4.84M
        _PyUnicode_CONVERT_BYTES(
2418
4.84M
            Py_UCS1, Py_UCS2,
2419
4.84M
            (const Py_UCS1 *)data,
2420
4.84M
            ((const Py_UCS1 *)data) + len,
2421
4.84M
            result);
2422
4.84M
        return result;
2423
2.63M
    case PyUnicode_4BYTE_KIND:
2424
2.63M
        result = PyMem_New(Py_UCS4, len);
2425
2.63M
        if (!result)
2426
0
            return PyErr_NoMemory();
2427
2.63M
        if (skind == PyUnicode_2BYTE_KIND) {
2428
0
            _PyUnicode_CONVERT_BYTES(
2429
0
                Py_UCS2, Py_UCS4,
2430
0
                (const Py_UCS2 *)data,
2431
0
                ((const Py_UCS2 *)data) + len,
2432
0
                result);
2433
0
        }
2434
2.63M
        else {
2435
2.63M
            assert(skind == PyUnicode_1BYTE_KIND);
2436
2.63M
            _PyUnicode_CONVERT_BYTES(
2437
2.63M
                Py_UCS1, Py_UCS4,
2438
2.63M
                (const Py_UCS1 *)data,
2439
2.63M
                ((const Py_UCS1 *)data) + len,
2440
2.63M
                result);
2441
2.63M
        }
2442
2.63M
        return result;
2443
0
    default:
2444
0
        Py_UNREACHABLE();
2445
0
        return NULL;
2446
7.47M
    }
2447
7.47M
}
2448
2449
static Py_UCS4*
2450
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2451
        int copy_null)
2452
73.4k
{
2453
73.4k
    int kind;
2454
73.4k
    const void *data;
2455
73.4k
    Py_ssize_t len, targetlen;
2456
73.4k
    kind = PyUnicode_KIND(string);
2457
73.4k
    data = PyUnicode_DATA(string);
2458
73.4k
    len = PyUnicode_GET_LENGTH(string);
2459
73.4k
    targetlen = len;
2460
73.4k
    if (copy_null)
2461
0
        targetlen++;
2462
73.4k
    if (!target) {
2463
0
        target = PyMem_New(Py_UCS4, targetlen);
2464
0
        if (!target) {
2465
0
            PyErr_NoMemory();
2466
0
            return NULL;
2467
0
        }
2468
0
    }
2469
73.4k
    else {
2470
73.4k
        if (targetsize < targetlen) {
2471
0
            PyErr_Format(PyExc_SystemError,
2472
0
                         "string is longer than the buffer");
2473
0
            if (copy_null && 0 < targetsize)
2474
0
                target[0] = 0;
2475
0
            return NULL;
2476
0
        }
2477
73.4k
    }
2478
73.4k
    if (kind == PyUnicode_1BYTE_KIND) {
2479
51.3k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2480
51.3k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2481
51.3k
    }
2482
22.0k
    else if (kind == PyUnicode_2BYTE_KIND) {
2483
15.7k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2484
15.7k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2485
15.7k
    }
2486
6.32k
    else if (kind == PyUnicode_4BYTE_KIND) {
2487
6.32k
        memcpy(target, data, len * sizeof(Py_UCS4));
2488
6.32k
    }
2489
0
    else {
2490
0
        Py_UNREACHABLE();
2491
0
    }
2492
73.4k
    if (copy_null)
2493
0
        target[len] = 0;
2494
73.4k
    return target;
2495
73.4k
}
2496
2497
Py_UCS4*
2498
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2499
                 int copy_null)
2500
73.4k
{
2501
73.4k
    if (target == NULL || targetsize < 0) {
2502
0
        PyErr_BadInternalCall();
2503
0
        return NULL;
2504
0
    }
2505
73.4k
    return as_ucs4(string, target, targetsize, copy_null);
2506
73.4k
}
2507
2508
Py_UCS4*
2509
PyUnicode_AsUCS4Copy(PyObject *string)
2510
0
{
2511
0
    return as_ucs4(string, NULL, 0, 1);
2512
0
}
2513
2514
/* Maximum number of characters required for the output of %jo, %jd or %p.
   At most ceil(log8(256)*sizeof(intmax_t)) digits are needed,
   plus 1 for the sign,
   plus 2 for the 0x prefix (for %p),
   plus 1 for the terminal NUL. */
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t) * 8 - 1) / 3)
2519
2520
static int
2521
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2522
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2523
7.89M
{
2524
7.89M
    Py_ssize_t length, fill, arglen;
2525
7.89M
    Py_UCS4 maxchar;
2526
2527
7.89M
    length = PyUnicode_GET_LENGTH(str);
2528
7.89M
    if ((precision == -1 || precision >= length)
2529
7.89M
        && width <= length)
2530
7.89M
        return _PyUnicodeWriter_WriteStr(writer, str);
2531
2532
48
    if (precision != -1)
2533
48
        length = Py_MIN(precision, length);
2534
2535
48
    arglen = Py_MAX(length, width);
2536
48
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2537
19
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2538
29
    else
2539
29
        maxchar = writer->maxchar;
2540
2541
48
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2542
0
        return -1;
2543
2544
48
    fill = Py_MAX(width - length, 0);
2545
48
    if (fill && !(flags & F_LJUST)) {
2546
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2547
0
            return -1;
2548
0
        writer->pos += fill;
2549
0
    }
2550
2551
48
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2552
48
                                  str, 0, length);
2553
48
    writer->pos += length;
2554
2555
48
    if (fill && (flags & F_LJUST)) {
2556
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2557
0
            return -1;
2558
0
        writer->pos += fill;
2559
0
    }
2560
2561
48
    return 0;
2562
48
}
2563
2564
static int
2565
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2566
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2567
3.27M
{
2568
    /* UTF-8 */
2569
3.27M
    Py_ssize_t *pconsumed = NULL;
2570
3.27M
    Py_ssize_t length;
2571
3.27M
    if (precision == -1) {
2572
848k
        length = strlen(str);
2573
848k
    }
2574
2.43M
    else {
2575
2.43M
        length = 0;
2576
38.5M
        while (length < precision && str[length]) {
2577
36.1M
            length++;
2578
36.1M
        }
2579
2.43M
        if (length == precision) {
2580
            /* The input string is not NUL-terminated.  If it ends with an
2581
             * incomplete UTF-8 sequence, truncate the string just before it.
2582
             * Incomplete sequences in the middle and sequences which cannot
2583
             * be valid prefixes are still treated as errors and replaced
2584
             * with \xfffd. */
2585
1.91k
            pconsumed = &length;
2586
1.91k
        }
2587
2.43M
    }
2588
2589
3.27M
    if (width < 0) {
2590
3.27M
        return _PyUnicode_DecodeUTF8Writer(writer, str, length,
2591
3.27M
                                           _Py_ERROR_REPLACE, "replace", pconsumed);
2592
3.27M
    }
2593
2594
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2595
0
                                                     "replace", pconsumed);
2596
0
    if (unicode == NULL)
2597
0
        return -1;
2598
2599
0
    int res = unicode_fromformat_write_str(writer, unicode,
2600
0
                                           width, -1, flags);
2601
0
    Py_DECREF(unicode);
2602
0
    return res;
2603
0
}
2604
2605
static int
2606
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2607
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2608
0
{
2609
0
    Py_ssize_t length;
2610
0
    if (precision == -1) {
2611
0
        length = wcslen(str);
2612
0
    }
2613
0
    else {
2614
0
        length = 0;
2615
0
        while (length < precision && str[length]) {
2616
0
            length++;
2617
0
        }
2618
0
    }
2619
2620
0
    if (width < 0) {
2621
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2622
0
                                             str, length);
2623
0
    }
2624
2625
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2626
0
    if (unicode == NULL)
2627
0
        return -1;
2628
2629
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2630
0
    Py_DECREF(unicode);
2631
0
    return res;
2632
0
}
2633
2634
0
/* Argument-size codes.
   NOTE(review): presumably these identify the length modifier parsed by
   unicode_fromformat_arg() -- the parser is outside this view; confirm. */
#define F_LONG      1
#define F_LONGLONG  2
#define F_SIZE      3
#define F_PTRDIFF   4
#define F_INTMAX    5
2639
2640
/* Format one conversion specification of a PyUnicode_FromFormat()-style
   format string and append the result to *writer*.

   f points at the '%' that starts the specification.  The arguments the
   specification needs are fetched from *vargs with va_arg().

   Return a pointer to the first character after the specification, or NULL
   on error.  The error paths visible here raise ValueError (width or
   precision too big), OverflowError (%c out of range), TypeError (%N
   argument is not a type) and SystemError (invalid specification); failures
   reported by the writer helpers or by the object conversions are
   propagated as NULL. */
static const char*
2641
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2642
                       const char *f, va_list *vargs)
2643
25.1M
{
2644
25.1M
    const char *p;
2645
25.1M
    Py_ssize_t len;
2646
25.1M
    int flags = 0;
2647
25.1M
    Py_ssize_t width;
2648
25.1M
    Py_ssize_t precision;
2649
2650
25.1M
    /* Remember where the specification starts: p is only used to report it
       in the "invalid format string" error. */
    p = f;
2651
25.1M
    f++;
2652
25.1M
    /* "%%" writes a literal percent sign and takes no argument. */
    if (*f == '%') {
2653
1.06M
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2654
0
            return NULL;
2655
1.06M
        f++;
2656
1.06M
        return f;
2657
1.06M
    }
2658
2659
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2660
    /* Flags '+', ' ' and '#' are not particularly useful.
2661
     * They are not worth the implementation and maintenance costs.
2662
     * In addition, '#' should add "0" for "o" conversions for compatibility
2663
     * with printf, but it would confuse Python users. */
2664
24.0M
    while (1) {
2665
24.0M
        switch (*f++) {
2666
0
        case '-': flags |= F_LJUST; continue;
2667
1.68k
        case '0': flags |= F_ZERO; continue;
2668
0
        case '#': flags |= F_ALT; continue;
2669
24.0M
        }
2670
24.0M
        f--;
2671
24.0M
        break;
2672
24.0M
    }
2673
2674
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2675
24.0M
    width = -1;
2676
24.0M
    if (*f == '*') {
2677
0
        width = va_arg(*vargs, int);
2678
0
        /* A negative '*' width requests left justification. */
        if (width < 0) {
2679
0
            flags |= F_LJUST;
2680
0
            width = -width;
2681
0
        }
2682
0
        f++;
2683
0
    }
2684
24.0M
    else if (Py_ISDIGIT((unsigned)*f)) {
2685
1.68k
        width = *f - '0';
2686
1.68k
        f++;
2687
1.68k
        while (Py_ISDIGIT((unsigned)*f)) {
2688
0
            /* Reject the next digit before width * 10 + digit could exceed
               PY_SSIZE_T_MAX. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2689
0
                PyErr_SetString(PyExc_ValueError,
2690
0
                                "width too big");
2691
0
                return NULL;
2692
0
            }
2693
0
            width = (width * 10) + (*f - '0');
2694
0
            f++;
2695
0
        }
2696
1.68k
    }
2697
24.0M
    precision = -1;
2698
24.0M
    if (*f == '.') {
2699
5.00M
        f++;
2700
5.00M
        if (*f == '*') {
2701
0
            precision = va_arg(*vargs, int);
2702
0
            /* A negative '*' precision is stored as -2, which differs from
               the "no precision given" value -1. */
            if (precision < 0) {
2703
0
                precision = -2;
2704
0
            }
2705
0
            f++;
2706
0
        }
2707
5.00M
        else if (Py_ISDIGIT((unsigned)*f)) {
2708
5.00M
            precision = (*f - '0');
2709
5.00M
            f++;
2710
15.0M
            while (Py_ISDIGIT((unsigned)*f)) {
2711
10.0M
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2712
0
                    PyErr_SetString(PyExc_ValueError,
2713
0
                                    "precision too big");
2714
0
                    return NULL;
2715
0
                }
2716
10.0M
                precision = (precision * 10) + (*f - '0');
2717
10.0M
                f++;
2718
10.0M
            }
2719
5.00M
        }
2720
5.00M
    }
2721
2722
24.0M
    /* Parse the optional size modifier: "l", "ll", "z", "t" or "j". */
    int sizemod = 0;
2723
24.0M
    if (*f == 'l') {
2724
0
        if (f[1] == 'l') {
2725
0
            sizemod = F_LONGLONG;
2726
0
            f += 2;
2727
0
        }
2728
0
        else {
2729
0
            sizemod = F_LONG;
2730
0
            ++f;
2731
0
        }
2732
0
    }
2733
24.0M
    else if (*f == 'z') {
2734
109k
        sizemod = F_SIZE;
2735
109k
        ++f;
2736
109k
    }
2737
23.9M
    else if (*f == 't') {
2738
0
        sizemod = F_PTRDIFF;
2739
0
        ++f;
2740
0
    }
2741
23.9M
    else if (*f == 'j') {
2742
0
        sizemod = F_INTMAX;
2743
0
        ++f;
2744
0
    }
2745
24.0M
    /* If the conversion character is the last character of the format
       string, stop overallocating the writer buffer. */
    if (f[0] != '\0' && f[1] == '\0')
2746
4.35M
        writer->overallocate = 0;
2747
2748
24.0M
    /* Check that the size modifier, width and precision are allowed for
       this conversion; the conversion itself is done by the second switch. */
    switch (*f) {
2749
11.2M
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2750
11.2M
        break;
2751
1.69M
    case 'c': case 'p':
2752
1.69M
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2753
1.69M
        break;
2754
3.27M
    case 's':
2755
3.27M
    case 'V':
2756
3.27M
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2757
3.27M
        break;
2758
7.89M
    default:
2759
7.89M
        if (sizemod) goto invalid_format;
2760
7.89M
        break;
2761
24.0M
    }
2762
2763
24.0M
    switch (*f) {
2764
1.68M
    case 'c':
2765
1.68M
    {
2766
1.68M
        int ordinal = va_arg(*vargs, int);
2767
1.68M
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2768
0
            PyErr_SetString(PyExc_OverflowError,
2769
0
                            "character argument not in range(0x110000)");
2770
0
            return NULL;
2771
0
        }
2772
1.68M
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2773
0
            return NULL;
2774
1.68M
        break;
2775
1.68M
    }
2776
2777
11.2M
    case 'd': case 'i':
2778
11.2M
    case 'o': case 'u': case 'x': case 'X':
2779
11.2M
    {
2780
11.2M
        char buffer[MAX_INTMAX_CHARS];
2781
2782
        // Fill buffer using sprintf, with one of many possible format
2783
        // strings, like "%llX" for `long long` in hexadecimal.
2784
        // The type/size is in `sizemod`; the format is in `*f`.
2785
2786
        // Use macros with nested switches to keep the sprintf format strings
2787
        // as compile-time literals, avoiding warnings and maybe allowing
2788
        // optimizations.
2789
2790
        // `SPRINT` macro does one sprintf
2791
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2792
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2793
11.2M
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2794
11.2M
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2795
2796
        // One inner switch to handle all format variants
2797
11.2M
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2798
11.2M
            switch (*f) {                                                     \
2799
96
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2800
17.6k
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2801
1.32k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2802
936
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2803
11.2M
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2804
11.2M
            }
2805
2806
        // Outer switch to handle all the sizes/types
2807
11.2M
        switch (sizemod) {
2808
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2809
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2810
109k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2811
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2812
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2813
11.1M
            default:         DO_SPRINTS("", int, unsigned int); break;
2814
11.2M
        }
2815
11.2M
        #undef SPRINT
2816
11.2M
        #undef DO_SPRINTS
2817
2818
11.2M
        assert(len >= 0);
2819
2820
11.2M
        /* The '-' sign is written separately so that zero padding can go
           between the sign and the digits; len then counts digits only. */
        int sign = (buffer[0] == '-');
2821
11.2M
        len -= sign;
2822
2823
11.2M
        /* Normalize: precision becomes the number of digits to emit
           (zero padding included), width the size of the whole field. */
        precision = Py_MAX(precision, len);
2824
11.2M
        width = Py_MAX(width, precision + sign);
2825
11.2M
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2826
1.68k
            precision = width - sign;
2827
1.68k
        }
2828
2829
11.2M
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2830
11.2M
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2831
2832
11.2M
        /* Reserve room for the whole field up front.
           NOTE(review): 127 is presumably the maximum character of the text
           about to be written (everything here is ASCII) -- confirm. */
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2833
0
            return NULL;
2834
2835
11.2M
        if (spacepad && !(flags & F_LJUST)) {
2836
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2837
0
                return NULL;
2838
0
            writer->pos += spacepad;
2839
0
        }
2840
2841
11.2M
        if (sign) {
2842
824
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2843
0
                return NULL;
2844
824
        }
2845
2846
11.2M
        if (zeropad) {
2847
634
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2848
0
                return NULL;
2849
634
            writer->pos += zeropad;
2850
634
        }
2851
2852
11.2M
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2853
0
            return NULL;
2854
2855
11.2M
        if (spacepad && (flags & F_LJUST)) {
2856
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2857
0
                return NULL;
2858
0
            writer->pos += spacepad;
2859
0
        }
2860
11.2M
        break;
2861
11.2M
    }
2862
2863
11.2M
    case 'p':
2864
2.92k
    {
2865
2.92k
        char number[MAX_INTMAX_CHARS];
2866
2867
2.92k
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2868
2.92k
        assert(len >= 0);
2869
2870
        /* %p is ill-defined:  ensure leading 0x. */
2871
2.92k
        if (number[1] == 'X')
2872
0
            number[1] = 'x';
2873
2.92k
        else if (number[1] != 'x') {
2874
0
            memmove(number + 2, number,
2875
0
                    strlen(number) + 1);
2876
0
            number[0] = '0';
2877
0
            number[1] = 'x';
2878
0
            len += 2;
2879
0
        }
2880
2881
2.92k
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2882
0
            return NULL;
2883
2.92k
        break;
2884
2.92k
    }
2885
2886
3.27M
    case 's':
2887
3.27M
    {
2888
3.27M
        /* Only the "l" modifier passes the validation above for %s:
           "%ls" takes a wchar_t string. */
        if (sizemod) {
2889
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
2890
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
2891
0
                return NULL;
2892
0
        }
2893
3.27M
        else {
2894
            /* UTF-8 */
2895
3.27M
            const char *s = va_arg(*vargs, const char*);
2896
3.27M
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
2897
0
                return NULL;
2898
3.27M
        }
2899
3.27M
        break;
2900
3.27M
    }
2901
2902
3.63M
    case 'U':
2903
3.63M
    {
2904
3.63M
        PyObject *obj = va_arg(*vargs, PyObject *);
2905
3.63M
        assert(obj && _PyUnicode_CHECK(obj));
2906
2907
3.63M
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2908
0
            return NULL;
2909
3.63M
        break;
2910
3.63M
    }
2911
2912
3.63M
    case 'V':
2913
581
    {
2914
581
        /* %V takes two arguments: an object and a C string (wide with the
           "l" modifier); the C string is used only when the object is
           NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
2915
581
        const char *str;
2916
581
        const wchar_t *wstr;
2917
581
        if (sizemod) {
2918
0
            wstr = va_arg(*vargs, const wchar_t*);
2919
0
        }
2920
581
        else {
2921
581
            str = va_arg(*vargs, const char *);
2922
581
        }
2923
581
        if (obj) {
2924
0
            assert(_PyUnicode_CHECK(obj));
2925
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2926
0
                return NULL;
2927
0
        }
2928
581
        else if (sizemod) {
2929
0
            assert(wstr != NULL);
2930
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
2931
0
                return NULL;
2932
0
        }
2933
581
        else {
2934
581
            assert(str != NULL);
2935
581
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
2936
0
                return NULL;
2937
581
        }
2938
581
        break;
2939
581
    }
2940
2941
1.59k
    case 'S':
2942
1.59k
    {
2943
1.59k
        PyObject *obj = va_arg(*vargs, PyObject *);
2944
1.59k
        PyObject *str;
2945
1.59k
        assert(obj);
2946
1.59k
        str = PyObject_Str(obj);
2947
1.59k
        if (!str)
2948
0
            return NULL;
2949
1.59k
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
2950
0
            Py_DECREF(str);
2951
0
            return NULL;
2952
0
        }
2953
1.59k
        Py_DECREF(str);
2954
1.59k
        break;
2955
1.59k
    }
2956
2957
2.57M
    case 'R':
2958
2.57M
    {
2959
2.57M
        PyObject *obj = va_arg(*vargs, PyObject *);
2960
2.57M
        PyObject *repr;
2961
2.57M
        assert(obj);
2962
2.57M
        repr = PyObject_Repr(obj);
2963
2.57M
        if (!repr)
2964
0
            return NULL;
2965
2.57M
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
2966
0
            Py_DECREF(repr);
2967
0
            return NULL;
2968
0
        }
2969
2.57M
        Py_DECREF(repr);
2970
2.57M
        break;
2971
2.57M
    }
2972
2973
0
    case 'A':
2974
0
    {
2975
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2976
0
        PyObject *ascii;
2977
0
        assert(obj);
2978
0
        ascii = PyObject_ASCII(obj);
2979
0
        if (!ascii)
2980
0
            return NULL;
2981
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
2982
0
            Py_DECREF(ascii);
2983
0
            return NULL;
2984
0
        }
2985
0
        Py_DECREF(ascii);
2986
0
        break;
2987
0
    }
2988
2989
1.67M
    case 'T':
2990
1.67M
    {
2991
1.67M
        PyObject *obj = va_arg(*vargs, PyObject *);
2992
1.67M
        /* Hold a strong reference to the type while its name is computed. */
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
2993
2994
1.67M
        PyObject *type_name;
2995
1.67M
        if (flags & F_ALT) {
2996
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
2997
0
        }
2998
1.67M
        else {
2999
1.67M
            type_name = PyType_GetFullyQualifiedName(type);
3000
1.67M
        }
3001
1.67M
        Py_DECREF(type);
3002
1.67M
        if (!type_name) {
3003
0
            return NULL;
3004
0
        }
3005
3006
1.67M
        if (unicode_fromformat_write_str(writer, type_name,
3007
1.67M
                                         width, precision, flags) == -1) {
3008
0
            Py_DECREF(type_name);
3009
0
            return NULL;
3010
0
        }
3011
1.67M
        Py_DECREF(type_name);
3012
1.67M
        break;
3013
1.67M
    }
3014
3015
0
    case 'N':
3016
0
    {
3017
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3018
0
        assert(type_raw != NULL);
3019
3020
0
        if (!PyType_Check(type_raw)) {
3021
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3022
0
            return NULL;
3023
0
        }
3024
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3025
3026
0
        PyObject *type_name;
3027
0
        if (flags & F_ALT) {
3028
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3029
0
        }
3030
0
        else {
3031
0
            type_name = PyType_GetFullyQualifiedName(type);
3032
0
        }
3033
0
        if (!type_name) {
3034
0
            return NULL;
3035
0
        }
3036
0
        if (unicode_fromformat_write_str(writer, type_name,
3037
0
                                         width, precision, flags) == -1) {
3038
0
            Py_DECREF(type_name);
3039
0
            return NULL;
3040
0
        }
3041
0
        Py_DECREF(type_name);
3042
0
        break;
3043
0
    }
3044
3045
0
    default:
3046
0
    /* Also reached by "goto invalid_format" from the validation switch. */
    invalid_format:
3047
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3048
0
        return NULL;
3049
24.0M
    }
3050
3051
24.0M
    /* Skip the conversion character. */
    f++;
3052
24.0M
    return f;
3053
24.0M
}
3054
3055
static int
3056
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3057
11.8M
{
3058
11.8M
    Py_ssize_t len = strlen(format);
3059
11.8M
    writer->min_length += len + 100;
3060
11.8M
    writer->overallocate = 1;
3061
3062
    // Copy varags to be able to pass a reference to a subfunction.
3063
11.8M
    va_list vargs2;
3064
11.8M
    va_copy(vargs2, vargs);
3065
3066
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3067
    // to be encoded to ASCII.
3068
11.8M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3069
11.8M
    if (!is_ascii) {
3070
0
        Py_ssize_t i;
3071
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3072
0
        PyErr_Format(PyExc_ValueError,
3073
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3074
0
            "string, got a non-ASCII byte: 0x%02x",
3075
0
            (unsigned char)format[i]);
3076
0
        goto fail;
3077
0
    }
3078
3079
66.7M
    for (const char *f = format; *f; ) {
3080
54.8M
        if (*f == '%') {
3081
25.1M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3082
25.1M
            if (f == NULL)
3083
0
                goto fail;
3084
25.1M
        }
3085
29.6M
        else {
3086
29.6M
            const char *p = strchr(f, '%');
3087
29.6M
            if (p != NULL) {
3088
22.1M
                len = p - f;
3089
22.1M
            }
3090
7.51M
            else {
3091
7.51M
                len = strlen(f);
3092
7.51M
                writer->overallocate = 0;
3093
7.51M
            }
3094
3095
29.6M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3096
0
                goto fail;
3097
0
            }
3098
29.6M
            f += len;
3099
29.6M
        }
3100
54.8M
    }
3101
11.8M
    va_end(vargs2);
3102
11.8M
    return 0;
3103
3104
0
  fail:
3105
0
    va_end(vargs2);
3106
0
    return -1;
3107
11.8M
}
3108
3109
PyObject *
3110
PyUnicode_FromFormatV(const char *format, va_list vargs)
3111
11.8M
{
3112
11.8M
    _PyUnicodeWriter writer;
3113
11.8M
    _PyUnicodeWriter_Init(&writer);
3114
3115
11.8M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3116
0
        _PyUnicodeWriter_Dealloc(&writer);
3117
0
        return NULL;
3118
0
    }
3119
11.8M
    return _PyUnicodeWriter_Finish(&writer);
3120
11.8M
}
3121
3122
/* Variadic front end of PyUnicode_FromFormatV(): collect the arguments
   following *format* and return whatever PyUnicode_FromFormatV() returns. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list ap;

    va_start(ap, format);
    PyObject *result = PyUnicode_FromFormatV(format, ap);
    va_end(ap);

    return result;
}
3133
3134
int
3135
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3136
0
{
3137
0
    va_list vargs;
3138
0
    va_start(vargs, format);
3139
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3140
0
    va_end(vargs);
3141
0
    return res;
3142
0
}
3143
3144
int
3145
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3146
                         va_list vargs)
3147
0
{
3148
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3149
0
    Py_ssize_t old_pos = _writer->pos;
3150
3151
0
    int res = unicode_from_format(_writer, format, vargs);
3152
3153
0
    if (res < 0) {
3154
0
        _writer->pos = old_pos;
3155
0
    }
3156
0
    return res;
3157
0
}
3158
3159
static Py_ssize_t
3160
unicode_get_widechar_size(PyObject *unicode)
3161
211k
{
3162
211k
    Py_ssize_t res;
3163
3164
211k
    assert(unicode != NULL);
3165
211k
    assert(_PyUnicode_CHECK(unicode));
3166
3167
211k
    res = _PyUnicode_LENGTH(unicode);
3168
#if SIZEOF_WCHAR_T == 2
3169
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3170
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3171
        const Py_UCS4 *end = s + res;
3172
        for (; s < end; ++s) {
3173
            if (*s > 0xFFFF) {
3174
                ++res;
3175
            }
3176
        }
3177
    }
3178
#endif
3179
211k
    return res;
3180
211k
}
3181
3182
static void
3183
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3184
211k
{
3185
211k
    assert(unicode != NULL);
3186
211k
    assert(_PyUnicode_CHECK(unicode));
3187
3188
211k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3189
1.31k
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3190
1.31k
        return;
3191
1.31k
    }
3192
3193
209k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3194
176k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3195
4.71M
        for (; size--; ++s, ++w) {
3196
4.53M
            *w = *s;
3197
4.53M
        }
3198
176k
    }
3199
33.0k
    else {
3200
33.0k
#if SIZEOF_WCHAR_T == 4
3201
33.0k
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3202
33.0k
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3203
2.11M
        for (; size--; ++s, ++w) {
3204
2.08M
            *w = *s;
3205
2.08M
        }
3206
#else
3207
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3208
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3209
        for (; size--; ++s, ++w) {
3210
            Py_UCS4 ch = *s;
3211
            if (ch > 0xFFFF) {
3212
                assert(ch <= MAX_UNICODE);
3213
                /* encode surrogate pair in this case */
3214
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3215
                if (!size--)
3216
                    break;
3217
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3218
            }
3219
            else {
3220
                *w = ch;
3221
            }
3222
        }
3223
#endif
3224
33.0k
    }
3225
209k
}
3226
3227
#ifdef HAVE_WCHAR_H
3228
3229
/* Convert a Unicode object to a wide character string.
3230
3231
   - If w is NULL: return the number of wide characters (including the null
3232
     character) required to convert the unicode object. Ignore size argument.
3233
3234
   - Otherwise: return the number of wide characters (excluding the null
3235
     character) written into w. Write at most size wide characters (including
3236
     the null character). */
3237
Py_ssize_t
3238
PyUnicode_AsWideChar(PyObject *unicode,
3239
                     wchar_t *w,
3240
                     Py_ssize_t size)
3241
1.41k
{
3242
1.41k
    Py_ssize_t res;
3243
3244
1.41k
    if (unicode == NULL) {
3245
0
        PyErr_BadInternalCall();
3246
0
        return -1;
3247
0
    }
3248
1.41k
    if (!PyUnicode_Check(unicode)) {
3249
0
        PyErr_BadArgument();
3250
0
        return -1;
3251
0
    }
3252
3253
1.41k
    res = unicode_get_widechar_size(unicode);
3254
1.41k
    if (w == NULL) {
3255
0
        return res + 1;
3256
0
    }
3257
3258
1.41k
    if (size > res) {
3259
1.41k
        size = res + 1;
3260
1.41k
    }
3261
0
    else {
3262
0
        res = size;
3263
0
    }
3264
1.41k
    unicode_copy_as_widechar(unicode, w, size);
3265
3266
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3267
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3268
       non-Unicode locales and hence needs conversion first. */
3269
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3270
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3271
            return -1;
3272
        }
3273
    }
3274
#endif
3275
3276
1.41k
    return res;
3277
1.41k
}
3278
3279
wchar_t*
3280
PyUnicode_AsWideCharString(PyObject *unicode,
3281
                           Py_ssize_t *size)
3282
209k
{
3283
209k
    wchar_t *buffer;
3284
209k
    Py_ssize_t buflen;
3285
3286
209k
    if (unicode == NULL) {
3287
0
        PyErr_BadInternalCall();
3288
0
        return NULL;
3289
0
    }
3290
209k
    if (!PyUnicode_Check(unicode)) {
3291
0
        PyErr_BadArgument();
3292
0
        return NULL;
3293
0
    }
3294
3295
209k
    buflen = unicode_get_widechar_size(unicode);
3296
209k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3297
209k
    if (buffer == NULL) {
3298
0
        PyErr_NoMemory();
3299
0
        return NULL;
3300
0
    }
3301
209k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3302
3303
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3304
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3305
       non-Unicode locales and hence needs conversion first. */
3306
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3307
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3308
            return NULL;
3309
        }
3310
    }
3311
#endif
3312
3313
209k
    if (size != NULL) {
3314
208k
        *size = buflen;
3315
208k
    }
3316
1.14k
    else if (wcslen(buffer) != (size_t)buflen) {
3317
0
        PyMem_Free(buffer);
3318
0
        PyErr_SetString(PyExc_ValueError,
3319
0
                        "embedded null character");
3320
0
        return NULL;
3321
0
    }
3322
209k
    return buffer;
3323
209k
}
3324
3325
#endif /* HAVE_WCHAR_H */
3326
3327
int
3328
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3329
0
{
3330
0
    wchar_t **p = (wchar_t **)ptr;
3331
0
    if (obj == NULL) {
3332
0
        PyMem_Free(*p);
3333
0
        *p = NULL;
3334
0
        return 1;
3335
0
    }
3336
0
    if (PyUnicode_Check(obj)) {
3337
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3338
0
        if (*p == NULL) {
3339
0
            return 0;
3340
0
        }
3341
0
        return Py_CLEANUP_SUPPORTED;
3342
0
    }
3343
0
    PyErr_Format(PyExc_TypeError,
3344
0
                 "argument must be str, not %.50s",
3345
0
                 Py_TYPE(obj)->tp_name);
3346
0
    return 0;
3347
0
}
3348
3349
int
3350
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3351
0
{
3352
0
    wchar_t **p = (wchar_t **)ptr;
3353
0
    if (obj == NULL) {
3354
0
        PyMem_Free(*p);
3355
0
        *p = NULL;
3356
0
        return 1;
3357
0
    }
3358
0
    if (obj == Py_None) {
3359
0
        *p = NULL;
3360
0
        return 1;
3361
0
    }
3362
0
    if (PyUnicode_Check(obj)) {
3363
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3364
0
        if (*p == NULL) {
3365
0
            return 0;
3366
0
        }
3367
0
        return Py_CLEANUP_SUPPORTED;
3368
0
    }
3369
0
    PyErr_Format(PyExc_TypeError,
3370
0
                 "argument must be str or None, not %.50s",
3371
0
                 Py_TYPE(obj)->tp_name);
3372
0
    return 0;
3373
0
}
3374
3375
PyObject *
3376
PyUnicode_FromOrdinal(int ordinal)
3377
9.19M
{
3378
9.19M
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3379
34
        PyErr_SetString(PyExc_ValueError,
3380
34
                        "chr() arg not in range(0x110000)");
3381
34
        return NULL;
3382
34
    }
3383
3384
9.19M
    return unicode_char((Py_UCS4)ordinal);
3385
9.19M
}
3386
3387
/* Return a str for *obj*:
   - an exact str is returned as a new reference to the same object;
   - an instance of a str subtype is copied with _PyUnicode_Copy();
   - any other object raises TypeError and NULL is returned.

   XXX Perhaps we should make this API an alias of
   PyObject_Str() instead ?! */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    /* For a Unicode subtype that's not a Unicode object,
       return a true Unicode object with the same data. */
    return _PyUnicode_Copy(obj);
}
3405
3406
/* Decode the bytes-like object *obj* using *encoding* and *errors* and
   return the result of PyUnicode_Decode().

   bytes objects take a direct path; other objects are read through the
   buffer interface (PyBUF_SIMPLE).  Empty input yields the empty string
   once encoding and errors have passed unicode_check_encoding_errors().

   Return NULL on error: obj is NULL (bad internal call), obj is a str
   (TypeError), obj does not export a buffer (TypeError), or a failure
   reported by the checks or by the decoder. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len != 0) {
        PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                             encoding, errors);
        PyBuffer_Release(&view);
        return decoded;
    }

    /* Empty buffer: release it first, then validate the arguments. */
    PyBuffer_Release(&view);
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    _Py_RETURN_UNICODE_EMPTY();
}
3458
3459
/* Normalize an encoding name like encodings.normalize_encoding()
   but allow to convert to lowercase if *to_lower* is true.

   Alphanumeric characters and '.' are copied to *lower* (lowercased when
   to_lower is true).  Each run of other characters becomes a single '_',
   except that leading and trailing runs are dropped.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len,
                       int to_lower)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last slot of the output buffer, reserved for the terminator. */
    char *const out_last = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        char c = *in;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Punctuation: remember that a separator is due. */
            pending_sep = 1;
            continue;
        }

        /* Emit the separator only between two copied characters. */
        if (pending_sep && out != lower) {
            if (out == out_last) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_last) {
            return 0;
        }
        *out++ = to_lower ? Py_TOLOWER(c) : c;
    }

    *out = '\0';
    return 1;
}
3508
3509
PyObject *
3510
PyUnicode_Decode(const char *s,
3511
                 Py_ssize_t size,
3512
                 const char *encoding,
3513
                 const char *errors)
3514
19.0M
{
3515
19.0M
    PyObject *buffer = NULL, *unicode;
3516
19.0M
    Py_buffer info;
3517
19.0M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3518
3519
19.0M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3520
0
        return NULL;
3521
0
    }
3522
3523
19.0M
    if (size == 0) {
3524
0
        _Py_RETURN_UNICODE_EMPTY();
3525
0
    }
3526
3527
19.0M
    if (encoding == NULL) {
3528
43.4k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3529
43.4k
    }
3530
3531
    /* Shortcuts for common default encodings */
3532
19.0M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3533
19.0M
        char *lower = buflower;
3534
3535
        /* Fast paths */
3536
19.0M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3537
3.29M
            lower += 3;
3538
3.29M
            if (*lower == '_') {
3539
                /* Match "utf8" and "utf_8" */
3540
3.29M
                lower++;
3541
3.29M
            }
3542
3543
3.29M
            if (lower[0] == '8' && lower[1] == 0) {
3544
3.29M
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3545
3.29M
            }
3546
1.16k
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3547
172
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3548
172
            }
3549
988
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3550
158
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3551
158
            }
3552
3.29M
        }
3553
15.7M
        else {
3554
15.7M
            if (strcmp(lower, "ascii") == 0
3555
11.6M
                || strcmp(lower, "us_ascii") == 0) {
3556
11.6M
                return PyUnicode_DecodeASCII(s, size, errors);
3557
11.6M
            }
3558
    #ifdef MS_WINDOWS
3559
            else if (strcmp(lower, "mbcs") == 0) {
3560
                return PyUnicode_DecodeMBCS(s, size, errors);
3561
            }
3562
    #endif
3563
4.02M
            else if (strcmp(lower, "latin1") == 0
3564
4.02M
                     || strcmp(lower, "latin_1") == 0
3565
1.16M
                     || strcmp(lower, "iso_8859_1") == 0
3566
2.88M
                     || strcmp(lower, "iso8859_1") == 0) {
3567
2.88M
                return PyUnicode_DecodeLatin1(s, size, errors);
3568
2.88M
            }
3569
15.7M
        }
3570
19.0M
    }
3571
3572
    /* Decode via the codec registry */
3573
1.15M
    buffer = NULL;
3574
1.15M
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3575
0
        goto onError;
3576
1.15M
    buffer = PyMemoryView_FromBuffer(&info);
3577
1.15M
    if (buffer == NULL)
3578
0
        goto onError;
3579
1.15M
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3580
1.15M
    if (unicode == NULL)
3581
71.7k
        goto onError;
3582
1.08M
    if (!PyUnicode_Check(unicode)) {
3583
0
        PyErr_Format(PyExc_TypeError,
3584
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3585
0
                     "use codecs.decode() to decode to arbitrary types",
3586
0
                     encoding,
3587
0
                     Py_TYPE(unicode)->tp_name);
3588
0
        Py_DECREF(unicode);
3589
0
        goto onError;
3590
0
    }
3591
1.08M
    Py_DECREF(buffer);
3592
1.08M
    return unicode_result(unicode);
3593
3594
71.7k
  onError:
3595
71.7k
    Py_XDECREF(buffer);
3596
71.7k
    return NULL;
3597
1.08M
}
3598
3599
/* Run the `encoding` decoder from the codec registry on a str object and
   return whatever object the decoder produces.  A NULL encoding selects
   PyUnicode_GetDefaultEncoding().  Returns NULL with an exception set when
   `unicode` is not a str or when decoding fails. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec_name, errors);
}
3615
3616
/* Run the `encoding` decoder from the codec registry on a str object and
   require the result to be a str as well.

   A NULL encoding selects PyUnicode_GetDefaultEncoding().  Returns a new
   reference, or NULL with an exception set:
   - the exception set by PyErr_BadArgument() if `unicode` is not a str,
   - whatever the codec raised if decoding fails,
   - TypeError if the decoder returned something other than a str. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL) {
        return NULL;
    }
    if (!PyUnicode_Check(v)) {
        /* Report the type of the decoder's result (v), not the type of the
           input, which is always 'str' at this point. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3649
3650
/* Run the `encoding` encoder from the codec registry on a str object and
   return whatever object the encoder produces.  A NULL encoding selects
   PyUnicode_GetDefaultEncoding().  Returns NULL with an exception set when
   `unicode` is not a str or when encoding fails. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry; a NULL result is passed on as is. */
    return PyCodec_Encode(unicode, codec_name, errors);
}
3674
3675
3676
static PyObject *
3677
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3678
                      int current_locale)
3679
0
{
3680
0
    Py_ssize_t wlen;
3681
0
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3682
0
    if (wstr == NULL) {
3683
0
        return NULL;
3684
0
    }
3685
3686
0
    if ((size_t)wlen != wcslen(wstr)) {
3687
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3688
0
        PyMem_Free(wstr);
3689
0
        return NULL;
3690
0
    }
3691
3692
0
    char *str;
3693
0
    size_t error_pos;
3694
0
    const char *reason;
3695
0
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3696
0
                                 current_locale, error_handler);
3697
0
    PyMem_Free(wstr);
3698
3699
0
    if (res != 0) {
3700
0
        if (res == -2) {
3701
0
            PyObject *exc;
3702
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3703
0
                    "locale", unicode,
3704
0
                    (Py_ssize_t)error_pos,
3705
0
                    (Py_ssize_t)(error_pos+1),
3706
0
                    reason);
3707
0
            if (exc != NULL) {
3708
0
                PyCodec_StrictErrors(exc);
3709
0
                Py_DECREF(exc);
3710
0
            }
3711
0
        }
3712
0
        else if (res == -3) {
3713
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3714
0
        }
3715
0
        else {
3716
0
            PyErr_NoMemory();
3717
0
        }
3718
0
        return NULL;
3719
0
    }
3720
3721
0
    PyObject *bytes = PyBytes_FromString(str);
3722
0
    PyMem_RawFree(str);
3723
0
    return bytes;
3724
0
}
3725
3726
/* Encode a str object with unicode_encode_locale(), using the error
   handler named by `errors` and current_locale=1. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    return unicode_encode_locale(unicode, _Py_GetErrorHandler(errors), 1);
}
3732
3733
/* Encode a str object with the interpreter's filesystem codec.

   Uses the UTF-8 encoder when the cached codec is UTF-8, the codec
   registry when another encoding name is cached, and otherwise falls back
   to a configuration-driven path (see the comment below). */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }
#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *filesystem_errors = config->filesystem_errors;
    assert(filesystem_errors != NULL);
    _Py_error_handler handler = get_error_handler_wide(filesystem_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3766
3767
/* Encode a str object to bytes using `encoding` and `errors`.

   A NULL encoding selects UTF-8.  A handful of very common encoding names
   are dispatched straight to their built-in encoders; everything else is
   encoded through the codec registry.  A codec that returns a bytearray
   triggers a RuntimeWarning and its result is copied into bytes; any other
   non-bytes result raises TypeError.

   Returns a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Room for the longest shortcut name, "iso_8859_1", plus its NUL:
       sizeof("iso_8859_1") == 11. */
    char buflower[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings.  When the normalized name
       does not fit into buflower, no shortcut is attempted. */
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
        const char *name = buflower;

        if (name[0] == 'u' && name[1] == 't' && name[2] == 'f') {
            name += 3;
            if (*name == '_') {
                /* Accept both "utf8" and "utf_8" spellings */
                name++;
            }

            if (strcmp(name, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(name, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(name, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
            /* Any other "utf*" name falls through to the registry. */
        }
        else if (strcmp(name, "ascii") == 0
                 || strcmp(name, "us_ascii") == 0) {
            return _PyUnicode_AsASCIIString(unicode, errors);
        }
#ifdef MS_WINDOWS
        else if (strcmp(name, "mbcs") == 0) {
            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
        }
#endif
        else if (strcmp(name, "latin1") == 0
                 || strcmp(name, "latin_1") == 0
                 || strcmp(name, "iso_8859_1") == 0
                 || strcmp(name, "iso8859_1") == 0) {
            return _PyUnicode_AsLatin1String(unicode, errors);
        }
    }

    /* Encode via the codec registry */
    PyObject *encoded = _PyCodec_EncodeText(unicode, encoding, errors);

    /* Failure (NULL) and the normal path (bytes) are both returned as is. */
    if (encoded == NULL || PyBytes_Check(encoded)) {
        return encoded;
    }

    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* The codec returned a bytearray: warn, then convert to bytes */
    if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding)) {
        Py_DECREF(encoded);
        return NULL;
    }

    PyObject *bytes = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                                PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return bytes;
}
3866
3867
/* Run the `encoding` encoder from the codec registry on a str object and
   require the result to be a str as well.  A NULL encoding selects
   PyUnicode_GetDefaultEncoding().

   Returns a new reference, or NULL with an exception set (TypeError when
   the encoder returned something other than a str). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    PyObject *result = PyCodec_Encode(unicode, encoding, errors);
    if (result == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(result)) {
        return result;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(result)->tp_name);
    Py_DECREF(result);
    return NULL;
}
3900
3901
static PyObject*
3902
unicode_decode_locale(const char *str, Py_ssize_t len,
3903
                      _Py_error_handler errors, int current_locale)
3904
336k
{
3905
336k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3906
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3907
0
        return NULL;
3908
0
    }
3909
3910
336k
    wchar_t *wstr;
3911
336k
    size_t wlen;
3912
336k
    const char *reason;
3913
336k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3914
336k
                                 current_locale, errors);
3915
336k
    if (res != 0) {
3916
0
        if (res == -2) {
3917
0
            PyObject *exc;
3918
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3919
0
                                        "locale", str, len,
3920
0
                                        (Py_ssize_t)wlen,
3921
0
                                        (Py_ssize_t)(wlen + 1),
3922
0
                                        reason);
3923
0
            if (exc != NULL) {
3924
0
                PyCodec_StrictErrors(exc);
3925
0
                Py_DECREF(exc);
3926
0
            }
3927
0
        }
3928
0
        else if (res == -3) {
3929
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3930
0
        }
3931
0
        else {
3932
0
            PyErr_NoMemory();
3933
0
        }
3934
0
        return NULL;
3935
0
    }
3936
3937
336k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3938
336k
    PyMem_RawFree(wstr);
3939
336k
    return unicode;
3940
336k
}
3941
3942
PyObject*
3943
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3944
                              const char *errors)
3945
0
{
3946
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3947
0
    return unicode_decode_locale(str, len, error_handler, 1);
3948
0
}
3949
3950
PyObject*
3951
PyUnicode_DecodeLocale(const char *str, const char *errors)
3952
336k
{
3953
336k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3954
336k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3955
336k
    return unicode_decode_locale(str, size, error_handler, 1);
3956
336k
}
3957
3958
3959
PyObject*
3960
221
PyUnicode_DecodeFSDefault(const char *s) {
3961
221
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3962
221
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3963
221
}
3964
3965
PyObject*
3966
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3967
161k
{
3968
161k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3969
161k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3970
161k
    if (fs_codec->utf8) {
3971
161k
        return unicode_decode_utf8(s, size,
3972
161k
                                   fs_codec->error_handler,
3973
161k
                                   fs_codec->errors,
3974
161k
                                   NULL);
3975
161k
    }
3976
37
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3977
37
    else if (fs_codec->encoding) {
3978
0
        return PyUnicode_Decode(s, size,
3979
0
                                fs_codec->encoding,
3980
0
                                fs_codec->errors);
3981
0
    }
3982
37
#endif
3983
37
    else {
3984
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3985
           machinery is not ready and so cannot be used:
3986
           use mbstowcs() in this case. */
3987
37
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3988
37
        const wchar_t *filesystem_errors = config->filesystem_errors;
3989
37
        assert(filesystem_errors != NULL);
3990
37
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3991
37
        assert(errors != _Py_ERROR_UNKNOWN);
3992
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3993
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3994
#else
3995
37
        return unicode_decode_locale(s, size, errors, 0);
3996
37
#endif
3997
37
    }
3998
161k
}
3999
4000
4001
int
4002
PyUnicode_FSConverter(PyObject* arg, void* addr)
4003
240k
{
4004
240k
    PyObject *path = NULL;
4005
240k
    PyObject *output = NULL;
4006
240k
    Py_ssize_t size;
4007
240k
    const char *data;
4008
240k
    if (arg == NULL) {
4009
0
        Py_DECREF(*(PyObject**)addr);
4010
0
        *(PyObject**)addr = NULL;
4011
0
        return 1;
4012
0
    }
4013
240k
    path = PyOS_FSPath(arg);
4014
240k
    if (path == NULL) {
4015
0
        return 0;
4016
0
    }
4017
240k
    if (PyBytes_Check(path)) {
4018
0
        output = path;
4019
0
    }
4020
240k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4021
240k
        output = PyUnicode_EncodeFSDefault(path);
4022
240k
        Py_DECREF(path);
4023
240k
        if (!output) {
4024
0
            return 0;
4025
0
        }
4026
240k
        assert(PyBytes_Check(output));
4027
240k
    }
4028
4029
240k
    size = PyBytes_GET_SIZE(output);
4030
240k
    data = PyBytes_AS_STRING(output);
4031
240k
    if ((size_t)size != strlen(data)) {
4032
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4033
0
        Py_DECREF(output);
4034
0
        return 0;
4035
0
    }
4036
240k
    *(PyObject**)addr = output;
4037
240k
    return Py_CLEANUP_SUPPORTED;
4038
240k
}
4039
4040
4041
int
4042
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4043
101k
{
4044
101k
    if (arg == NULL) {
4045
0
        Py_DECREF(*(PyObject**)addr);
4046
0
        *(PyObject**)addr = NULL;
4047
0
        return 1;
4048
0
    }
4049
4050
101k
    PyObject *path = PyOS_FSPath(arg);
4051
101k
    if (path == NULL) {
4052
0
        return 0;
4053
0
    }
4054
4055
101k
    PyObject *output = NULL;
4056
101k
    if (PyUnicode_Check(path)) {
4057
101k
        output = path;
4058
101k
    }
4059
0
    else if (PyBytes_Check(path)) {
4060
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4061
0
                                                  PyBytes_GET_SIZE(path));
4062
0
        Py_DECREF(path);
4063
0
        if (!output) {
4064
0
            return 0;
4065
0
        }
4066
0
    }
4067
0
    else {
4068
0
        PyErr_Format(PyExc_TypeError,
4069
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4070
0
                     Py_TYPE(arg)->tp_name);
4071
0
        Py_DECREF(path);
4072
0
        return 0;
4073
0
    }
4074
4075
101k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4076
101k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4077
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4078
0
        Py_DECREF(output);
4079
0
        return 0;
4080
0
    }
4081
101k
    *(PyObject**)addr = output;
4082
101k
    return Py_CLEANUP_SUPPORTED;
4083
101k
}
4084
4085
4086
static int unicode_fill_utf8(PyObject *unicode);
4087
4088
4089
static int
4090
unicode_ensure_utf8(PyObject *unicode)
4091
61.2M
{
4092
61.2M
    int err = 0;
4093
61.2M
    if (PyUnicode_UTF8(unicode) == NULL) {
4094
152k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4095
152k
        if (PyUnicode_UTF8(unicode) == NULL) {
4096
152k
            err = unicode_fill_utf8(unicode);
4097
152k
        }
4098
152k
        Py_END_CRITICAL_SECTION();
4099
152k
    }
4100
61.2M
    return err;
4101
61.2M
}
4102
4103
const char *
4104
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4105
61.2M
{
4106
61.2M
    if (!PyUnicode_Check(unicode)) {
4107
0
        PyErr_BadArgument();
4108
0
        if (psize) {
4109
0
            *psize = -1;
4110
0
        }
4111
0
        return NULL;
4112
0
    }
4113
4114
61.2M
    if (unicode_ensure_utf8(unicode) == -1) {
4115
206
        if (psize) {
4116
206
            *psize = -1;
4117
206
        }
4118
206
        return NULL;
4119
206
    }
4120
4121
61.2M
    if (psize) {
4122
60.9M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4123
60.9M
    }
4124
61.2M
    return PyUnicode_UTF8(unicode);
4125
61.2M
}
4126
4127
const char *
4128
PyUnicode_AsUTF8(PyObject *unicode)
4129
251k
{
4130
251k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4131
251k
}
4132
4133
const char *
4134
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4135
2.85M
{
4136
2.85M
    Py_ssize_t size;
4137
2.85M
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4138
2.85M
    if (s && strlen(s) != (size_t)size) {
4139
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4140
0
        return NULL;
4141
0
    }
4142
2.85M
    return s;
4143
2.85M
}
4144
4145
/*
4146
PyUnicode_GetSize() has been deprecated since Python 3.3
4147
because it returned length of Py_UNICODE.
4148
4149
But this function is part of stable abi, because it doesn't
4150
include Py_UNICODE in signature and it was not excluded from
4151
stable ABI in PEP 384.
4152
*/
4153
PyAPI_FUNC(Py_ssize_t)
4154
PyUnicode_GetSize(PyObject *unicode)
4155
0
{
4156
0
    PyErr_SetString(PyExc_RuntimeError,
4157
0
                    "PyUnicode_GetSize has been removed.");
4158
0
    return -1;
4159
0
}
4160
4161
Py_ssize_t
4162
PyUnicode_GetLength(PyObject *unicode)
4163
25.7k
{
4164
25.7k
    if (!PyUnicode_Check(unicode)) {
4165
0
        PyErr_BadArgument();
4166
0
        return -1;
4167
0
    }
4168
25.7k
    return PyUnicode_GET_LENGTH(unicode);
4169
25.7k
}
4170
4171
Py_UCS4
4172
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4173
29
{
4174
29
    const void *data;
4175
29
    int kind;
4176
4177
29
    if (!PyUnicode_Check(unicode)) {
4178
0
        PyErr_BadArgument();
4179
0
        return (Py_UCS4)-1;
4180
0
    }
4181
29
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4182
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4183
0
        return (Py_UCS4)-1;
4184
0
    }
4185
29
    data = PyUnicode_DATA(unicode);
4186
29
    kind = PyUnicode_KIND(unicode);
4187
29
    return PyUnicode_READ(kind, data, index);
4188
29
}
4189
4190
int
4191
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4192
0
{
4193
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4194
0
        PyErr_BadArgument();
4195
0
        return -1;
4196
0
    }
4197
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4198
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4199
0
        return -1;
4200
0
    }
4201
0
    if (unicode_check_modifiable(unicode))
4202
0
        return -1;
4203
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4204
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4205
0
        return -1;
4206
0
    }
4207
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4208
0
                    index, ch);
4209
0
    return 0;
4210
0
}
4211
4212
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4217
4218
/* create or adjust a UnicodeDecodeError */
4219
static void
4220
make_decode_exception(PyObject **exceptionObject,
4221
                      const char *encoding,
4222
                      const char *input, Py_ssize_t length,
4223
                      Py_ssize_t startpos, Py_ssize_t endpos,
4224
                      const char *reason)
4225
2.48M
{
4226
2.48M
    if (*exceptionObject == NULL) {
4227
2.26M
        *exceptionObject = PyUnicodeDecodeError_Create(
4228
2.26M
            encoding, input, length, startpos, endpos, reason);
4229
2.26M
    }
4230
213k
    else {
4231
213k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4232
0
            goto onError;
4233
213k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4234
0
            goto onError;
4235
213k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4236
0
            goto onError;
4237
213k
    }
4238
2.48M
    return;
4239
4240
2.48M
onError:
4241
0
    Py_CLEAR(*exceptionObject);
4242
0
}
4243
4244
#ifdef MS_WINDOWS
4245
static int
4246
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4247
{
4248
    if (newsize > *size) {
4249
        wchar_t *newbuf = *buf;
4250
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4251
            PyErr_NoMemory();
4252
            return -1;
4253
        }
4254
        *buf = newbuf;
4255
    }
4256
    *size = newsize;
4257
    return 0;
4258
}
4259
4260
/* error handling callback helper:
4261
   build arguments, call the callback and check the arguments,
4262
   if no exception occurred, copy the replacement to the output
4263
   and adjust various state variables.
4264
   return 0 on success, -1 on error
4265
*/
4266
4267
static int
4268
unicode_decode_call_errorhandler_wchar(
4269
    const char *errors, PyObject **errorHandler,
4270
    const char *encoding, const char *reason,
4271
    const char **input, const char **inend, Py_ssize_t *startinpos,
4272
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4273
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4274
{
4275
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4276
4277
    PyObject *restuple = NULL;
4278
    PyObject *repunicode = NULL;
4279
    Py_ssize_t outsize;
4280
    Py_ssize_t insize;
4281
    Py_ssize_t requiredsize;
4282
    Py_ssize_t newpos;
4283
    PyObject *inputobj = NULL;
4284
    Py_ssize_t repwlen;
4285
4286
    if (*errorHandler == NULL) {
4287
        *errorHandler = PyCodec_LookupError(errors);
4288
        if (*errorHandler == NULL)
4289
            goto onError;
4290
    }
4291
4292
    make_decode_exception(exceptionObject,
4293
        encoding,
4294
        *input, *inend - *input,
4295
        *startinpos, *endinpos,
4296
        reason);
4297
    if (*exceptionObject == NULL)
4298
        goto onError;
4299
4300
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4301
    if (restuple == NULL)
4302
        goto onError;
4303
    if (!PyTuple_Check(restuple)) {
4304
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4305
        goto onError;
4306
    }
4307
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4308
        goto onError;
4309
4310
    /* Copy back the bytes variables, which might have been modified by the
4311
       callback */
4312
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4313
    if (!inputobj)
4314
        goto onError;
4315
    *input = PyBytes_AS_STRING(inputobj);
4316
    insize = PyBytes_GET_SIZE(inputobj);
4317
    *inend = *input + insize;
4318
    /* we can DECREF safely, as the exception has another reference,
4319
       so the object won't go away. */
4320
    Py_DECREF(inputobj);
4321
4322
    if (newpos<0)
4323
        newpos = insize+newpos;
4324
    if (newpos<0 || newpos>insize) {
4325
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4326
        goto onError;
4327
    }
4328
4329
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4330
    if (repwlen < 0)
4331
        goto onError;
4332
    repwlen--;
4333
    /* need more space? (at least enough for what we
4334
       have+the replacement+the rest of the string (starting
4335
       at the new input position), so we won't have to check space
4336
       when there are no errors in the rest of the string) */
4337
    requiredsize = *outpos;
4338
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4339
        goto overflow;
4340
    requiredsize += repwlen;
4341
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4342
        goto overflow;
4343
    requiredsize += insize - newpos;
4344
    outsize = *bufsize;
4345
    if (requiredsize > outsize) {
4346
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4347
            requiredsize = 2*outsize;
4348
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4349
            goto onError;
4350
        }
4351
    }
4352
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4353
    *outpos += repwlen;
4354
    *endinpos = newpos;
4355
    *inptr = *input + newpos;
4356
4357
    /* we made it! */
4358
    Py_DECREF(restuple);
4359
    return 0;
4360
4361
  overflow:
4362
    PyErr_SetString(PyExc_OverflowError,
4363
                    "decoded result is too long for a Python string");
4364
4365
  onError:
4366
    Py_XDECREF(restuple);
4367
    return -1;
4368
}
4369
#endif   /* MS_WINDOWS */
4370
4371
static int
4372
unicode_decode_call_errorhandler_writer(
4373
    const char *errors, PyObject **errorHandler,
4374
    const char *encoding, const char *reason,
4375
    const char **input, const char **inend, Py_ssize_t *startinpos,
4376
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4377
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4378
2.48M
{
4379
2.48M
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4380
4381
2.48M
    PyObject *restuple = NULL;
4382
2.48M
    PyObject *repunicode = NULL;
4383
2.48M
    Py_ssize_t insize;
4384
2.48M
    Py_ssize_t newpos;
4385
2.48M
    Py_ssize_t replen;
4386
2.48M
    Py_ssize_t remain;
4387
2.48M
    PyObject *inputobj = NULL;
4388
2.48M
    int need_to_grow = 0;
4389
2.48M
    const char *new_inptr;
4390
4391
2.48M
    if (*errorHandler == NULL) {
4392
2.26M
        *errorHandler = PyCodec_LookupError(errors);
4393
2.26M
        if (*errorHandler == NULL)
4394
0
            goto onError;
4395
2.26M
    }
4396
4397
2.48M
    make_decode_exception(exceptionObject,
4398
2.48M
        encoding,
4399
2.48M
        *input, *inend - *input,
4400
2.48M
        *startinpos, *endinpos,
4401
2.48M
        reason);
4402
2.48M
    if (*exceptionObject == NULL)
4403
0
        goto onError;
4404
4405
2.48M
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4406
2.48M
    if (restuple == NULL)
4407
2.23M
        goto onError;
4408
252k
    if (!PyTuple_Check(restuple)) {
4409
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4410
0
        goto onError;
4411
0
    }
4412
252k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4413
0
        goto onError;
4414
4415
    /* Copy back the bytes variables, which might have been modified by the
4416
       callback */
4417
252k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4418
252k
    if (!inputobj)
4419
0
        goto onError;
4420
252k
    remain = *inend - *input - *endinpos;
4421
252k
    *input = PyBytes_AS_STRING(inputobj);
4422
252k
    insize = PyBytes_GET_SIZE(inputobj);
4423
252k
    *inend = *input + insize;
4424
    /* we can DECREF safely, as the exception has another reference,
4425
       so the object won't go away. */
4426
252k
    Py_DECREF(inputobj);
4427
4428
252k
    if (newpos<0)
4429
0
        newpos = insize+newpos;
4430
252k
    if (newpos<0 || newpos>insize) {
4431
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4432
0
        goto onError;
4433
0
    }
4434
4435
252k
    replen = PyUnicode_GET_LENGTH(repunicode);
4436
252k
    if (replen > 1) {
4437
26.5k
        writer->min_length += replen - 1;
4438
26.5k
        need_to_grow = 1;
4439
26.5k
    }
4440
252k
    new_inptr = *input + newpos;
4441
252k
    if (*inend - new_inptr > remain) {
4442
        /* We don't know the decoding algorithm here so we make the worst
4443
           assumption that one byte decodes to one unicode character.
4444
           If unfortunately one byte could decode to more unicode characters,
4445
           the decoder may write out-of-bound then.  Is it possible for the
4446
           algorithms using this function? */
4447
12.1k
        writer->min_length += *inend - new_inptr - remain;
4448
12.1k
        need_to_grow = 1;
4449
12.1k
    }
4450
252k
    if (need_to_grow) {
4451
26.7k
        writer->overallocate = 1;
4452
26.7k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4453
26.7k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4454
0
            goto onError;
4455
26.7k
    }
4456
252k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4457
0
        goto onError;
4458
4459
252k
    *endinpos = newpos;
4460
252k
    *inptr = new_inptr;
4461
4462
    /* we made it! */
4463
252k
    Py_DECREF(restuple);
4464
252k
    return 0;
4465
4466
2.23M
  onError:
4467
2.23M
    Py_XDECREF(restuple);
4468
2.23M
    return -1;
4469
252k
}
4470
4471
/* --- UTF-7 Codec -------------------------------------------------------- */
4472
4473
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4474
4475
/* Three simple macros defining base-64. */
4476
4477
/* Non-zero when c belongs to the base-64 alphabet: the letters A-Z and
   a-z, the digits 0-9, '+' and '/'.  The argument is evaluated several
   times, so it must be free of side effects. */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||       \
     ((c) >= '0' && (c) <= '9') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= 'A' && (c) <= 'Z'))
4484
4485
/* Map a base-64 character c to its 6-bit value (A-Z -> 0..25,
   a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, anything else -> 63).
   Callers are expected to have checked IS_BASE64(c) first, so the final
   branch is reached for '/' only.  c is evaluated several times. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     63)
4492
4493
/* Base-64 character for the low 6 bits of n; higher bits are masked off. */

#define TO_BASE64(n)  \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"    \
      "abcdefghijklmnopqrstuvwxyz"    \
      "0123456789+/")[(n) & 0x3f])
4497
4498
/* DECODE_DIRECT: true when a byte met outside a shift sequence stands
 * for itself.  Decoding is permissive: every ASCII value (0..127) is
 * taken literally, with the single exception of '+', which opens a
 * base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
4505
4506
/* The UTF-7 encoder treats ASCII characters differently according to
4507
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4508
 * the above).  See RFC2152.  This array identifies these different
4509
 * sets:
4510
 * 0 : "Set D"
4511
 *     alphanumeric and '(),-./:?
4512
 * 1 : "Set O"
4513
 *     !"#$%&*;<=>@[]^_`{|}
4514
 * 2 : "whitespace"
4515
 *     ht nl cr sp
4516
 * 3 : special (must be base64 encoded)
4517
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4518
 */
4519
4520
/* Read-only lookup table indexed by ASCII code (0..127).  Each entry is
 * one of the category numbers 0-3 described in the comment above.  The
 * table is never modified, hence const. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4539
4540
/* ENCODE_DIRECT: true when the encoder may emit character c as itself.
 * RFC 2152 leaves it to the application whether Set O and whitespace
 * are written directly; this macro always allows it: every non-NUL
 * ASCII character whose utf7_category entry is not 3 (i.e. Set D,
 * Set O and whitespace) is encoded directly.  The range test comes
 * first so the table is never indexed out of bounds. */

#define ENCODE_DIRECT(c) \
    ((c) > 0 && (c) < 128 && utf7_category[(c)] != 3)
4548
4549
PyObject *
4550
PyUnicode_DecodeUTF7(const char *s,
4551
                     Py_ssize_t size,
4552
                     const char *errors)
4553
0
{
4554
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4555
0
}
4556
4557
/* The decoder.  The only state we preserve is our read position,
4558
 * i.e. how many characters we have consumed.  So if we end in the
4559
 * middle of a shift sequence we have to back off the read position
4560
 * and the output to the beginning of the sequence, otherwise we lose
4561
 * all the shift state (seen bits, number of bits seen, high
4562
 * surrogate). */
4563
4564
/* Decode the `size` bytes at `s` as UTF-7.
 *
 * s        - input bytes
 * size     - number of input bytes
 * errors   - error handler name passed on to
 *            unicode_decode_call_errorhandler_writer()
 * consumed - if non-NULL, receives the number of input bytes actually
 *            decoded.  When the input stops inside a shift sequence,
 *            *consumed is set to the offset of the '+' that opened it
 *            and any output produced by that sequence is discarded.
 *            If NULL, a shift sequence left in an inconsistent state at
 *            end of input is reported through the error handler.
 *
 * Returns a new string object, or NULL on failure.
 */
PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    /* startinpos/endinpos delimit the offending input range handed to
       the error handler; startinpos also remembers where the current
       shift sequence began. */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;
    /* writer.pos at the moment the current shift sequence was opened */
    Py_ssize_t shiftOutStart;
    /* base64buffer holds base64bits not-yet-consumed bits */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* pending high surrogate, or 0 when none is waiting */
    Py_UCS4 surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point used after the end-of-input error handler
           moved `s` back inside the buffer. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as is and
                               go on to process outCh normally */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A high surrogate still pending when the section ends is
                   written out unpaired, but only if the terminating byte
                   is one that decodes directly. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                /* also taken when '+' is the last input byte */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* byte >= 128 outside a shift sequence */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Report input[startinpos:endinpos]; on success the handler
           updates `s` (and possibly starts/e) to where decoding resumes. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have asked to resume before the end */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Input ended inside a shift sequence: report only the bytes
               before its '+' as consumed. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result from the first shiftOutStart characters
                   only.  NOTE(review): presumably done so the discarded
                   characters do not influence the result's storage
                   width -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4760
4761
4762
/* Encode the string object `str` as UTF-7.
 *
 * Characters accepted by ENCODE_DIRECT() are copied as they are, '+' is
 * written as "+-", and everything else goes into a "+...-" base-64
 * section holding the UTF-16 code units (non-BMP characters are split
 * into a surrogate pair).  `errors` is part of the signature but is not
 * consulted here.
 *
 * Returns a new bytes object, or NULL on failure.
 */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);

    /* The output buffer is sized for 8 bytes per input character; it
       might be possible to tighten this worst case. */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(len * 8);
    if (writer == NULL) {
        return NULL;
    }

    int shifted = 0;              /* inside a base-64 section? */
    unsigned int pending_bits = 0;  /* bits in bitbuf not yet written */
    unsigned long bitbuf = 0;
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        /* A literal '+' outside a base-64 section is escaped as "+-". */
        if (!shifted && ch == '+') {
            *out++ = '+';
            *out++ = '-';
            continue;
        }

        if (ENCODE_DIRECT(ch)) {
            if (shifted) {
                /* shifting out: flush the remaining bits, zero-padded */
                if (pending_bits) {
                    *out++ = TO_BASE64(bitbuf << (6-pending_bits));
                    bitbuf = 0;
                    pending_bits = 0;
                }
                shifted = 0;
                /* Characters not in the BASE64 set implicitly unshift the
                   sequence so no '-' is required, except if the character
                   is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
            }
            *out++ = (char) ch;
            continue;
        }

        /* ch has to be base-64 encoded; open a section if necessary. */
        if (!shifted) {
            *out++ = '+';
            shifted = 1;
        }

        /* Split ch into one or two UTF-16 code units. */
        Py_UCS4 units[2];
        int nunits = 0;
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);
            units[nunits++] = Py_UNICODE_HIGH_SURROGATE(ch);
            units[nunits++] = Py_UNICODE_LOW_SURROGATE(ch);
        }
        else {
            units[nunits++] = ch;
        }

        /* Append each unit to the bit buffer and emit every complete
           6-bit group. */
        for (int k = 0; k < nunits; k++) {
            pending_bits += 16;
            bitbuf = (bitbuf << 16) | units[k];
            while (pending_bits >= 6) {
                *out++ = TO_BASE64(bitbuf >> (pending_bits-6));
                pending_bits -= 6;
            }
        }
    }

    /* Flush left-over bits and close a section that is still open. */
    if (pending_bits) {
        *out++ = TO_BASE64(bitbuf << (6-pending_bits));
    }
    if (shifted) {
        *out++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, out);
}
4851
4852
#undef IS_BASE64
4853
#undef FROM_BASE64
4854
#undef TO_BASE64
4855
#undef DECODE_DIRECT
4856
#undef ENCODE_DIRECT
4857
4858
/* --- UTF-8 Codec -------------------------------------------------------- */
4859
4860
PyObject *
4861
PyUnicode_DecodeUTF8(const char *s,
4862
                     Py_ssize_t size,
4863
                     const char *errors)
4864
69.6M
{
4865
69.6M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4866
69.6M
}
4867
4868
#include "stringlib/asciilib.h"
4869
#include "stringlib/codecs.h"
4870
#include "stringlib/undef.h"
4871
4872
#include "stringlib/ucs1lib.h"
4873
#include "stringlib/codecs.h"
4874
#include "stringlib/undef.h"
4875
4876
#include "stringlib/ucs2lib.h"
4877
#include "stringlib/codecs.h"
4878
#include "stringlib/undef.h"
4879
4880
#include "stringlib/ucs4lib.h"
4881
#include "stringlib/codecs.h"
4882
#include "stringlib/undef.h"
4883
4884
/* Word-sized bit patterns used by the chunked ASCII / UTF-8 scanners.
 *
 * ASCII_CHAR_MASK - bit 7 of every byte; a non-zero AND with a 'size_t'
 *                   means the word contains a non-ASCII byte.
 * VECTOR_0101     - bit 0 of every byte; used to count codepoints in a
 *                   UTF-8 string.
 * VECTOR_00FF     - every other byte fully set; used to count codepoints
 *                   in a UTF-8 string.
 */
#if SIZEOF_SIZE_T == 4
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#elif SIZEOF_SIZE_T == 8
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#else
# error C 'size_t' size should be either 4 or 8!
#endif
4898
4899
/* ctz(v): index of the lowest set bit of v (count of trailing zeros).
 * HAVE_CTZ is 1 when an implementation is available (Clang/GCC builtin
 * or MSVC intrinsic) and 0 otherwise, in which case ctz() is not
 * defined.  Every call visible in this file passes a non-zero value. */
#if defined(__clang__) || defined(__GNUC__)
# define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long long wide = (unsigned long long)v;
    return __builtin_ctzll(wide);
}
#elif defined(_MSC_VER)
# define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long bit_index;
# if SIZEOF_SIZE_T == 4
    _BitScanForward(&bit_index, v);
# else
    _BitScanForward64(&bit_index, v);
# endif /* SIZEOF_SIZE_T */
    return bit_index;
}
#else
# define HAVE_CTZ 0
#endif
4922
4923
#if HAVE_CTZ && PY_LITTLE_ENDIAN
4924
// load p[0]..p[size-1] as a size_t without unaligned access nor read ahead.
4925
static size_t
4926
load_unaligned(const unsigned char *p, size_t size)
4927
58.9M
{
4928
58.9M
    union {
4929
58.9M
        size_t s;
4930
58.9M
        unsigned char b[SIZEOF_SIZE_T];
4931
58.9M
    } u;
4932
58.9M
    u.s = 0;
4933
    // This switch statement assumes little endian because:
4934
    // * union is faster than bitwise or and shift.
4935
    // * big endian machine is rare and hard to maintain.
4936
58.9M
    switch (size) {
4937
0
    default:
4938
0
#if SIZEOF_SIZE_T == 8
4939
0
    case 8:
4940
0
        u.b[7] = p[7];
4941
0
        _Py_FALLTHROUGH;
4942
4.02M
    case 7:
4943
4.02M
        u.b[6] = p[6];
4944
4.02M
        _Py_FALLTHROUGH;
4945
10.2M
    case 6:
4946
10.2M
        u.b[5] = p[5];
4947
10.2M
        _Py_FALLTHROUGH;
4948
17.6M
    case 5:
4949
17.6M
        u.b[4] = p[4];
4950
17.6M
        _Py_FALLTHROUGH;
4951
17.6M
#endif
4952
22.7M
    case 4:
4953
22.7M
        u.b[3] = p[3];
4954
22.7M
        _Py_FALLTHROUGH;
4955
38.6M
    case 3:
4956
38.6M
        u.b[2] = p[2];
4957
38.6M
        _Py_FALLTHROUGH;
4958
50.7M
    case 2:
4959
50.7M
        u.b[1] = p[1];
4960
50.7M
        _Py_FALLTHROUGH;
4961
52.9M
    case 1:
4962
52.9M
        u.b[0] = p[0];
4963
52.9M
        break;
4964
6.03M
    case 0:
4965
6.03M
        break;
4966
58.9M
    }
4967
58.9M
    return u.s;
4968
58.9M
}
4969
#endif
4970
4971
/*
4972
 * Find the first non-ASCII character in a byte sequence.
4973
 *
4974
 * This function scans a range of bytes from `start` to `end` and returns the
4975
 * index of the first byte that is not an ASCII character (i.e., has the most
4976
 * significant bit set). If all characters in the range are ASCII, it returns
4977
 * `end - start`.
4978
 */
4979
static Py_ssize_t
find_first_nonascii(const unsigned char *start, const unsigned char *end)
{
    // The search is done in `size_t` chunks.
    // The start and end might not be aligned at `size_t` boundaries,
    // so they're handled specially.

    const unsigned char *p = start;

    if (end - start >= SIZEOF_SIZE_T) {
        // Avoid unaligned read.
#if PY_LITTLE_ENDIAN && HAVE_CTZ
        // Check the first (possibly unaligned) word through memcpy.
        size_t u;
        memcpy(&u, p, sizeof(size_t));
        u &= ASCII_CHAR_MASK;
        if (u) {
            // After masking, only bit 7 of a byte can be set, so the
            // lowest set bit is at 8*k+7 where k is the index of the
            // first non-ASCII byte (little endian).
            return (ctz(u) - 7) / 8;
        }
        // Continue from the first aligned address after `start`; bytes
        // between it and start + SIZEOF_SIZE_T are scanned a second time.
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
        // Byte-wise scan up to the next aligned address.
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
        while (p < p2) {
            if (*p & 0x80) {
                return p - start;
            }
            p++;
        }
#endif

        // `e` is the last position from which a whole word can be read.
        const unsigned char *e = end - SIZEOF_SIZE_T;
        while (p <= e) {
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
            if (u) {
#if PY_LITTLE_ENDIAN && HAVE_CTZ
                return p - start + (ctz(u) - 7) / 8;
#else
                // big endian and minor compilers are difficult to test.
                // fallback to per byte check.
                break;
#endif
            }
            p += SIZEOF_SIZE_T;
        }
    }
#if PY_LITTLE_ENDIAN && HAVE_CTZ
    // Fewer than SIZEOF_SIZE_T bytes remain; load them zero-padded.
    assert((end - p) < SIZEOF_SIZE_T);
    // we can not use *(const size_t*)p to avoid buffer overrun.
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
    if (u) {
        return p - start + (ctz(u) - 7) / 8;
    }
    return end - start;
#else
    // Byte-wise scan of the tail (or of the word that contained a
    // non-ASCII byte).
    while (p < end) {
        if (*p & 0x80) {
            break;
        }
        p++;
    }
    return p - start;
#endif
}
5041
5042
/* Return 1 if byte `ch` starts a UTF-8 sequence (0xxxxxxx or 11xxxxxx)
 * and 0 if it is a continuation byte (10xxxxxx). */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    // Only the top two bits of the byte matter: the pattern 10 marks a
    // continuation byte, everything else is a first byte.
    return (ch & 0xC0) != 0x80;
}
5048
5049
static inline size_t
5050
vector_utf8_start_chars(size_t v)
5051
53.4M
{
5052
53.4M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5053
53.4M
}
5054
5055
5056
// Count the number of UTF-8 code points in a given byte sequence.
5057
static Py_ssize_t
5058
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5059
349k
{
5060
349k
    Py_ssize_t len = 0;
5061
5062
349k
    if (end - s >= SIZEOF_SIZE_T) {
5063
280k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5064
16.3k
            len += scalar_utf8_start_char(*s++);
5065
16.3k
        }
5066
5067
728k
        while (s + SIZEOF_SIZE_T <= end) {
5068
464k
            const unsigned char *e = end;
5069
464k
            if (e - s > SIZEOF_SIZE_T * 255) {
5070
202k
                e = s + SIZEOF_SIZE_T * 255;
5071
202k
            }
5072
464k
            Py_ssize_t vstart = 0;
5073
53.9M
            while (s + SIZEOF_SIZE_T <= e) {
5074
53.4M
                size_t v = *(size_t*)s;
5075
53.4M
                size_t vs = vector_utf8_start_chars(v);
5076
53.4M
                vstart += vs;
5077
53.4M
                s += SIZEOF_SIZE_T;
5078
53.4M
            }
5079
464k
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5080
464k
            vstart += vstart >> 16;
5081
464k
#if SIZEOF_SIZE_T == 8
5082
464k
            vstart += vstart >> 32;
5083
464k
#endif
5084
464k
            len += vstart & 0x7ff;
5085
464k
        }
5086
263k
    }
5087
1.21M
    while (s < end) {
5088
863k
        len += scalar_utf8_start_char(*s++);
5089
863k
    }
5090
349k
    return len;
5091
349k
}
5092
5093
static Py_ssize_t
5094
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5095
14.4M
{
5096
14.4M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5097
14.4M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5098
14.3M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5099
11.2M
    {
5100
        /* Fast path, see in STRINGLIB(utf8_decode) for
5101
           an explanation. */
5102
11.2M
        const char *p = start;
5103
11.2M
        Py_UCS1 *q = dest;
5104
16.2M
        while (p + SIZEOF_SIZE_T <= end) {
5105
7.30M
            size_t value = *(const size_t *) p;
5106
7.30M
            if (value & ASCII_CHAR_MASK)
5107
2.31M
                break;
5108
4.98M
            *((size_t *)q) = value;
5109
4.98M
            p += SIZEOF_SIZE_T;
5110
4.98M
            q += SIZEOF_SIZE_T;
5111
4.98M
        }
5112
52.2M
        while (p < end) {
5113
43.3M
            if ((unsigned char)*p & 0x80)
5114
2.33M
                break;
5115
41.0M
            *q++ = *p++;
5116
41.0M
        }
5117
11.2M
        return p - start;
5118
11.2M
    }
5119
3.19M
#endif
5120
3.19M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5121
3.19M
                                         (const unsigned char*)end);
5122
3.19M
    memcpy(dest, start, pos);
5123
3.19M
    return pos;
5124
14.4M
}
5125
5126
/* Decode UTF-8 from [s, end) into `writer`.
 *
 * writer        - output writer; characters are appended at writer->pos
 * starts        - start of the whole input; error positions and
 *                 *consumed are offsets relative to it
 * s, end        - range that remains to be decoded
 * error_handler - pre-resolved handler, or _Py_ERROR_UNKNOWN to resolve
 *                 it from `errors` on the first error
 * errors        - error handler name
 * consumed      - if non-NULL, an incomplete sequence at the end of the
 *                 input stops decoding instead of raising an error, and
 *                 *consumed receives the number of bytes decoded
 *
 * Returns 0 on success and -1 on failure.
 */
static int
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
                         const char *starts, const char *s, const char *end,
                         _Py_error_handler error_handler,
                         const char *errors,
                         Py_ssize_t *consumed)
{
    Py_ssize_t startinpos, endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;

    while (s < end) {
        Py_UCS4 ch;
        int kind = writer->kind;

        /* Run the decoder specialised for the writer's current storage
           kind; it advances `s` and writer->pos and returns a status or
           character code in `ch` (interpreted by the switch below). */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer->buffer))
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
        }

        switch (ch) {
        case 0:
            /* Either all input was decoded, or it stops in the middle of
               a sequence; the latter is only an error when the caller
               did not ask for `consumed`. */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
            {
                /* Truncated surrogate code in range D800-DFFF */
                goto End;
            }
            _Py_FALLTHROUGH;
        case 3:
        case 4:
            /* The offending range covers the ch - 1 bytes before the bad
               continuation byte. */
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            // ch doesn't fit into kind, so change the buffer kind to write
            // the character
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
                goto onError;
            continue;
        }

        /* Only reached on a decoding error: resolve the handler lazily. */
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            /* skip the offending bytes */
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            /* one U+FFFD for the whole offending range */
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* Each offending byte b is written as the code 0xdc00 + b,
               which needs at least 2-byte storage. */
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
                                ch + 0xdc00);
                writer->pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            /* Any other handler goes through the generic callback, which
               may update starts, end and s. */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }

            /* NOTE(review): presumably re-reserves room for the input
               that is still to be decoded -- confirm. */
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
                goto onError;
            }
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return -1;
}
5247
5248
5249
static PyObject *
5250
unicode_decode_utf8(const char *s, Py_ssize_t size,
5251
                    _Py_error_handler error_handler, const char *errors,
5252
                    Py_ssize_t *consumed)
5253
102M
{
5254
102M
    if (size == 0) {
5255
2.51M
        if (consumed) {
5256
0
            *consumed = 0;
5257
0
        }
5258
2.51M
        _Py_RETURN_UNICODE_EMPTY();
5259
2.51M
    }
5260
5261
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5262
99.7M
    if (size == 1 && (unsigned char)s[0] < 128) {
5263
36.0M
        if (consumed) {
5264
0
            *consumed = 1;
5265
0
        }
5266
36.0M
        return get_latin1_char((unsigned char)s[0]);
5267
36.0M
    }
5268
5269
    // I don't know this check is necessary or not. But there is a test
5270
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5271
63.6M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5272
0
        PyErr_NoMemory();
5273
0
        return NULL;
5274
0
    }
5275
5276
63.6M
    const char *starts = s;
5277
63.6M
    const char *end = s + size;
5278
5279
63.6M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5280
63.6M
    if (pos == size) {  // fast path: ASCII string.
5281
47.7M
        PyObject *u = PyUnicode_New(size, 127);
5282
47.7M
        if (u == NULL) {
5283
0
            return NULL;
5284
0
        }
5285
47.7M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5286
47.7M
        if (consumed) {
5287
104
            *consumed = size;
5288
104
        }
5289
47.7M
        return u;
5290
47.7M
    }
5291
5292
15.8M
    int maxchr = 127;
5293
15.8M
    Py_ssize_t maxsize = size;
5294
5295
15.8M
    unsigned char ch = (unsigned char)(s[pos]);
5296
    // error handler other than strict may remove/replace the invalid byte.
5297
    // consumed != NULL allows 1~3 bytes remainings.
5298
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5299
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5300
    // reallocation and copy.
5301
15.8M
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5302
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5303
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5304
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5305
        // means that it is no longer necessary to allocate several times the required amount
5306
        // of memory.
5307
349k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5308
349k
        if (ch < 0xc4) { // latin1
5309
226k
            maxchr = 0xff;
5310
226k
        }
5311
122k
        else if (ch < 0xf0) { // ucs2
5312
109k
            maxchr = 0xffff;
5313
109k
        }
5314
13.4k
        else { // ucs4
5315
13.4k
            maxchr = 0x10ffff;
5316
13.4k
        }
5317
349k
    }
5318
15.8M
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5319
15.8M
    if (!u) {
5320
0
        return NULL;
5321
0
    }
5322
5323
    // Use _PyUnicodeWriter after fast path is failed.
5324
15.8M
    _PyUnicodeWriter writer;
5325
15.8M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5326
15.8M
    if (maxchr <= 255) {
5327
15.7M
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5328
15.7M
        s += pos;
5329
15.7M
        writer.pos = pos;
5330
15.7M
    }
5331
5332
15.8M
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5333
15.8M
                                 error_handler, errors,
5334
15.8M
                                 consumed) < 0) {
5335
1.61k
        _PyUnicodeWriter_Dealloc(&writer);
5336
1.61k
        return NULL;
5337
1.61k
    }
5338
15.8M
    return _PyUnicodeWriter_Finish(&writer);
5339
15.8M
}
5340
5341
5342
// Used by PyUnicodeWriter_WriteUTF8() implementation
5343
int
5344
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
5345
                            const char *s, Py_ssize_t size,
5346
                            _Py_error_handler error_handler, const char *errors,
5347
                            Py_ssize_t *consumed)
5348
3.27M
{
5349
3.27M
    if (size == 0) {
5350
8.58k
        if (consumed) {
5351
0
            *consumed = 0;
5352
0
        }
5353
8.58k
        return 0;
5354
8.58k
    }
5355
5356
    // fast path: try ASCII string.
5357
3.27M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5358
0
        return -1;
5359
0
    }
5360
5361
3.27M
    const char *starts = s;
5362
3.27M
    const char *end = s + size;
5363
3.27M
    Py_ssize_t decoded = 0;
5364
3.27M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5365
3.27M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5366
3.26M
        decoded = ascii_decode(s, end, dest);
5367
3.26M
        writer->pos += decoded;
5368
5369
3.26M
        if (decoded == size) {
5370
3.23M
            if (consumed) {
5371
1.26k
                *consumed = size;
5372
1.26k
            }
5373
3.23M
            return 0;
5374
3.23M
        }
5375
36.0k
        s += decoded;
5376
36.0k
    }
5377
5378
37.8k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5379
37.8k
                                    error_handler, errors, consumed);
5380
3.27M
}
5381
5382
5383
PyObject *
5384
PyUnicode_DecodeUTF8Stateful(const char *s,
5385
                             Py_ssize_t size,
5386
                             const char *errors,
5387
                             Py_ssize_t *consumed)
5388
102M
{
5389
102M
    return unicode_decode_utf8(s, size,
5390
102M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5391
102M
                               errors, consumed);
5392
102M
}
5393
5394
5395
/* UTF-8 decoder producing a wchar_t string.

   'errors' selects the error handling and must be _Py_ERROR_STRICT,
   _Py_ERROR_SURROGATEESCAPE or _Py_ERROR_SURROGATEPASS.  Any other value
   makes the function return -3 before anything is allocated.

   On success, return 0, write a pointer to a newly allocated wide character
   string into *wstr (use PyMem_RawFree() to free the memory) and write the
   output length (in number of wchar_t units) into *wlen (if wlen is set).

   On memory allocation failure, return -1.

   On a decoding error that the selected handler does not absorb, return -2.
   If wlen is non-NULL, write the start of the illegal byte sequence into
   *wlen.  If reason is not NULL, write the decoding error message into
   *reason. */
5407
int
5408
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5409
                 const char **reason, _Py_error_handler errors)
5410
296
{
5411
296
    const char *orig_s = s;
5412
296
    const char *e;
5413
296
    wchar_t *unicode;
5414
296
    Py_ssize_t outpos;
5415
5416
296
    int surrogateescape = 0;
5417
296
    int surrogatepass = 0;
5418
296
    /* Translate the handler enum into two local flags; unsupported
       handlers are rejected with -3. */
    switch (errors)
5419
296
    {
5420
0
    case _Py_ERROR_STRICT:
5421
0
        break;
5422
296
    case _Py_ERROR_SURROGATEESCAPE:
5423
296
        surrogateescape = 1;
5424
296
        break;
5425
0
    case _Py_ERROR_SURROGATEPASS:
5426
0
        surrogatepass = 1;
5427
0
        break;
5428
0
    default:
5429
0
        return -3;
5430
296
    }
5431
5432
    /* Note: size will always be longer than the resulting Unicode
5433
       character count */
5434
296
    /* Guard the (size + 1) * sizeof(wchar_t) allocation below. */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5435
0
        return -1;
5436
0
    }
5437
5438
296
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5439
296
    if (!unicode) {
5440
0
        return -1;
5441
0
    }
5442
5443
    /* Unpack UTF-8 encoded data */
5444
296
    e = s + size;
5445
296
    outpos = 0;
5446
296
    while (s < e) {
5447
296
        Py_UCS4 ch;
5448
296
#if SIZEOF_WCHAR_T == 4
5449
296
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5450
#else
5451
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5452
#endif
5453
296
        /* NOTE(review): judging by how the value is used below, the helper
           returns either a character it could not store itself (> 0xFF) or
           a small status code (0..4); confirm against the utf8_decode
           helper. */
        if (ch > 0xFF) {
5454
0
#if SIZEOF_WCHAR_T == 4
5455
0
            Py_UNREACHABLE();
5456
#else
5457
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5458
            /* write a surrogate pair */
5459
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5460
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5461
#endif
5462
0
        }
5463
296
        else {
5464
296
            /* Status 0 with the whole input consumed: decoding is done. */
            if (!ch && s == e) {
5465
296
                break;
5466
296
            }
5467
5468
0
            /* surrogateescape: store the offending byte as 0xDC00 + byte
               and resume decoding right after it. */
            if (surrogateescape) {
5469
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5470
0
            }
5471
0
            else {
5472
                /* Is it a valid three-byte code? */
5473
0
                /* surrogatepass: only the bit patterns of the three bytes
                   are checked here before the value is stored. */
                if (surrogatepass
5474
0
                    && (e - s) >= 3
5475
0
                    && (s[0] & 0xf0) == 0xe0
5476
0
                    && (s[1] & 0xc0) == 0x80
5477
0
                    && (s[2] & 0xc0) == 0x80)
5478
0
                {
5479
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5480
0
                    s += 3;
5481
0
                    unicode[outpos++] = ch;
5482
0
                }
5483
0
                else {
5484
0
                    /* Unrecoverable error: release the buffer and report
                       the reason and the offset of the bad sequence. */
                    PyMem_RawFree(unicode );
5485
0
                    if (reason != NULL) {
5486
0
                        switch (ch) {
5487
0
                        case 0:
5488
0
                            *reason = "unexpected end of data";
5489
0
                            break;
5490
0
                        case 1:
5491
0
                            *reason = "invalid start byte";
5492
0
                            break;
5493
                        /* 2, 3, 4 */
5494
0
                        default:
5495
0
                            *reason = "invalid continuation byte";
5496
0
                            break;
5497
0
                        }
5498
0
                    }
5499
0
                    if (wlen != NULL) {
5500
0
                        *wlen = s - orig_s;
5501
0
                    }
5502
0
                    return -2;
5503
0
                }
5504
0
            }
5505
0
        }
5506
296
    }
5507
296
    /* Terminate the result; the buffer was allocated with size + 1 units. */
    unicode[outpos] = L'\0';
5508
296
    if (wlen) {
5509
296
        *wlen = outpos;
5510
296
    }
5511
296
    *wstr = unicode;
5512
296
    return 0;
5513
296
}
5514
5515
5516
wchar_t*
5517
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5518
                               size_t *wlen)
5519
0
{
5520
0
    wchar_t *wstr;
5521
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5522
0
                               &wstr, wlen,
5523
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5524
0
    if (res != 0) {
5525
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5526
0
        assert(res != -3);
5527
0
        if (wlen) {
5528
0
            *wlen = (size_t)res;
5529
0
        }
5530
0
        return NULL;
5531
0
    }
5532
0
    return wstr;
5533
0
}
5534
5535
5536
/* UTF-8 encoder.
5537
5538
   On success, return 0 and write the newly allocated character string (use
5539
   PyMem_Free() to free the memory) into *str.
5540
5541
   On encoding failure, return -2 and write the position of the invalid
5542
   surrogate character into *error_pos (if error_pos is set) and the decoding
5543
   error message into *reason (if reason is set).
5544
5545
   On memory allocation failure, return -1. */
5546
int
5547
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5548
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5549
629
{
5550
629
    const Py_ssize_t max_char_size = 4;
5551
629
    Py_ssize_t len = wcslen(text);
5552
5553
629
    assert(len >= 0);
5554
5555
629
    int surrogateescape = 0;
5556
629
    int surrogatepass = 0;
5557
629
    switch (errors)
5558
629
    {
5559
148
    case _Py_ERROR_STRICT:
5560
148
        break;
5561
481
    case _Py_ERROR_SURROGATEESCAPE:
5562
481
        surrogateescape = 1;
5563
481
        break;
5564
0
    case _Py_ERROR_SURROGATEPASS:
5565
0
        surrogatepass = 1;
5566
0
        break;
5567
0
    default:
5568
0
        return -3;
5569
629
    }
5570
5571
629
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5572
0
        return -1;
5573
0
    }
5574
629
    char *bytes;
5575
629
    if (raw_malloc) {
5576
629
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5577
629
    }
5578
0
    else {
5579
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5580
0
    }
5581
629
    if (bytes == NULL) {
5582
0
        return -1;
5583
0
    }
5584
5585
629
    char *p = bytes;
5586
629
    Py_ssize_t i;
5587
19.7k
    for (i = 0; i < len; ) {
5588
19.0k
        Py_ssize_t ch_pos = i;
5589
19.0k
        Py_UCS4 ch = text[i];
5590
19.0k
        i++;
5591
19.0k
        if (sizeof(wchar_t) == 2
5592
0
            && Py_UNICODE_IS_HIGH_SURROGATE(ch)
5593
0
            && i < len
5594
0
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5595
0
        {
5596
0
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5597
0
            i++;
5598
0
        }
5599
5600
19.0k
        if (ch < 0x80) {
5601
            /* Encode ASCII */
5602
19.0k
            *p++ = (char) ch;
5603
5604
19.0k
        }
5605
0
        else if (ch < 0x0800) {
5606
            /* Encode Latin-1 */
5607
0
            *p++ = (char)(0xc0 | (ch >> 6));
5608
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5609
0
        }
5610
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5611
            /* surrogateescape error handler */
5612
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5613
0
                if (error_pos != NULL) {
5614
0
                    *error_pos = (size_t)ch_pos;
5615
0
                }
5616
0
                if (reason != NULL) {
5617
0
                    *reason = "encoding error";
5618
0
                }
5619
0
                if (raw_malloc) {
5620
0
                    PyMem_RawFree(bytes);
5621
0
                }
5622
0
                else {
5623
0
                    PyMem_Free(bytes);
5624
0
                }
5625
0
                return -2;
5626
0
            }
5627
0
            *p++ = (char)(ch & 0xff);
5628
0
        }
5629
0
        else if (ch < 0x10000) {
5630
0
            *p++ = (char)(0xe0 | (ch >> 12));
5631
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5632
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5633
0
        }
5634
0
        else {  /* ch >= 0x10000 */
5635
0
            assert(ch <= MAX_UNICODE);
5636
            /* Encode UCS4 Unicode ordinals */
5637
0
            *p++ = (char)(0xf0 | (ch >> 18));
5638
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5639
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5640
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5641
0
        }
5642
19.0k
    }
5643
629
    *p++ = '\0';
5644
5645
629
    size_t final_size = (p - bytes);
5646
629
    char *bytes2;
5647
629
    if (raw_malloc) {
5648
629
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5649
629
    }
5650
0
    else {
5651
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5652
0
    }
5653
629
    if (bytes2 == NULL) {
5654
0
        if (error_pos != NULL) {
5655
0
            *error_pos = (size_t)-1;
5656
0
        }
5657
0
        if (raw_malloc) {
5658
0
            PyMem_RawFree(bytes);
5659
0
        }
5660
0
        else {
5661
0
            PyMem_Free(bytes);
5662
0
        }
5663
0
        return -1;
5664
0
    }
5665
629
    *str = bytes2;
5666
629
    return 0;
5667
629
}
5668
5669
5670
/* Primary internal function which creates utf8 encoded bytes objects.
5671
5672
   Allocation strategy:  if the string is short, convert into a stack buffer
5673
   and allocate exactly as much space needed at the end.  Else allocate the
5674
   maximum possible needed (4 result bytes per Unicode character), and return
5675
   the excess memory at the end.
5676
*/
5677
static PyObject *
5678
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5679
                    const char *errors)
5680
17.8M
{
5681
17.8M
    if (!PyUnicode_Check(unicode)) {
5682
0
        PyErr_BadArgument();
5683
0
        return NULL;
5684
0
    }
5685
5686
17.8M
    if (PyUnicode_UTF8(unicode))
5687
9.18M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5688
9.18M
                                         PyUnicode_UTF8_LENGTH(unicode));
5689
5690
8.66M
    int kind = PyUnicode_KIND(unicode);
5691
8.66M
    const void *data = PyUnicode_DATA(unicode);
5692
8.66M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5693
5694
8.66M
    PyBytesWriter *writer;
5695
8.66M
    char *end;
5696
5697
8.66M
    switch (kind) {
5698
0
    default:
5699
0
        Py_UNREACHABLE();
5700
5.80M
    case PyUnicode_1BYTE_KIND:
5701
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5702
5.80M
        assert(!PyUnicode_IS_ASCII(unicode));
5703
5.80M
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5704
5.80M
                                      error_handler, errors, &end);
5705
5.80M
        break;
5706
1.65M
    case PyUnicode_2BYTE_KIND:
5707
1.65M
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5708
1.65M
                                      error_handler, errors, &end);
5709
1.65M
        break;
5710
1.20M
    case PyUnicode_4BYTE_KIND:
5711
1.20M
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5712
1.20M
                                      error_handler, errors, &end);
5713
1.20M
        break;
5714
8.66M
    }
5715
5716
8.66M
    if (writer == NULL) {
5717
148k
        PyBytesWriter_Discard(writer);
5718
148k
        return NULL;
5719
148k
    }
5720
8.51M
    return PyBytesWriter_FinishWithPointer(writer, end);
5721
8.66M
}
5722
5723
static int
5724
unicode_fill_utf8(PyObject *unicode)
5725
152k
{
5726
152k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5727
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5728
152k
    assert(!PyUnicode_IS_ASCII(unicode));
5729
5730
152k
    int kind = PyUnicode_KIND(unicode);
5731
152k
    const void *data = PyUnicode_DATA(unicode);
5732
152k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5733
5734
152k
    PyBytesWriter *writer;
5735
152k
    char *end;
5736
5737
152k
    switch (kind) {
5738
0
    default:
5739
0
        Py_UNREACHABLE();
5740
114k
    case PyUnicode_1BYTE_KIND:
5741
114k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5742
114k
                                      _Py_ERROR_STRICT, NULL, &end);
5743
114k
        break;
5744
31.6k
    case PyUnicode_2BYTE_KIND:
5745
31.6k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5746
31.6k
                                      _Py_ERROR_STRICT, NULL, &end);
5747
31.6k
        break;
5748
6.65k
    case PyUnicode_4BYTE_KIND:
5749
6.65k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5750
6.65k
                                      _Py_ERROR_STRICT, NULL, &end);
5751
6.65k
        break;
5752
152k
    }
5753
152k
    if (writer == NULL) {
5754
206
        return -1;
5755
206
    }
5756
5757
152k
    const char *start = PyBytesWriter_GetData(writer);
5758
152k
    Py_ssize_t len = end - start;
5759
5760
152k
    char *cache = PyMem_Malloc(len + 1);
5761
152k
    if (cache == NULL) {
5762
0
        PyBytesWriter_Discard(writer);
5763
0
        PyErr_NoMemory();
5764
0
        return -1;
5765
0
    }
5766
152k
    memcpy(cache, start, len);
5767
152k
    cache[len] = '\0';
5768
152k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5769
152k
    PyUnicode_SET_UTF8(unicode, cache);
5770
152k
    PyBytesWriter_Discard(writer);
5771
152k
    return 0;
5772
152k
}
5773
5774
/* Encode 'unicode' to UTF-8 through unicode_encode_utf8().  The error
   handler is given by name in 'errors' and passed on as _Py_ERROR_UNKNOWN. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    const _Py_error_handler handler = _Py_ERROR_UNKNOWN;
    return unicode_encode_utf8(unicode, handler, errors);
}
5779
5780
5781
/* Encode 'unicode' to UTF-8 without naming an error handler: calls
   _PyUnicode_AsUTF8String() with errors == NULL. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, errors);
}
5786
5787
/* --- UTF-32 Codec ------------------------------------------------------- */
5788
5789
PyObject *
5790
PyUnicode_DecodeUTF32(const char *s,
5791
                      Py_ssize_t size,
5792
                      const char *errors,
5793
                      int *byteorder)
5794
158
{
5795
158
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5796
158
}
5797
5798
/* Decode 'size' bytes of UTF-32 starting at 's' into a new str object.

   byteorder: if non-NULL, *byteorder gives the initial byte order.  When it
       is 0 and at least 4 bytes are available, a leading BOM is looked for
       (and skipped if found), and the resulting order (-1 for little-endian,
       1 for big-endian, 0 if no BOM was found) is written back to
       *byteorder.
   errors: name of the error handler, passed to
       unicode_decode_call_errorhandler_writer().
   consumed: if non-NULL, trailing bytes that do not form a full 4-byte unit
       are not reported as an error, and the number of bytes used is stored
       in *consumed when the decoding loop finishes.

   Return the new object, or NULL on failure. */
PyObject *
5799
PyUnicode_DecodeUTF32Stateful(const char *s,
5800
                              Py_ssize_t size,
5801
                              const char *errors,
5802
                              int *byteorder,
5803
                              Py_ssize_t *consumed)
5804
32.2k
{
5805
32.2k
    const char *starts = s;
5806
32.2k
    Py_ssize_t startinpos;
5807
32.2k
    Py_ssize_t endinpos;
5808
32.2k
    _PyUnicodeWriter writer;
5809
32.2k
    const unsigned char *q, *e;
5810
32.2k
    int le, bo = 0;       /* assume native ordering by default */
5811
32.2k
    const char *encoding;
5812
32.2k
    const char *errmsg = "";
5813
32.2k
    PyObject *errorHandler = NULL;
5814
32.2k
    PyObject *exc = NULL;
5815
5816
32.2k
    q = (const unsigned char *)s;
5817
32.2k
    e = q + size;
5818
5819
32.2k
    if (byteorder)
5820
32.1k
        bo = *byteorder;
5821
5822
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5823
       byte order setting accordingly. In native mode, the leading BOM
5824
       mark is skipped, in all other modes, it is copied to the output
5825
       stream as-is (giving a ZWNBSP character). */
5826
32.2k
    if (bo == 0 && size >= 4) {
5827
29.9k
        /* The first four bytes are assembled as a little-endian value, so
           0x0000FEFF identifies little-endian input and 0xFFFE0000
           big-endian input. */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5828
29.9k
        if (bom == 0x0000FEFF) {
5829
164
            bo = -1;
5830
164
            q += 4;
5831
164
        }
5832
29.7k
        else if (bom == 0xFFFE0000) {
5833
270
            bo = 1;
5834
270
            q += 4;
5835
270
        }
5836
29.9k
        if (byteorder)
5837
29.7k
            *byteorder = bo;
5838
29.9k
    }
5839
5840
32.2k
    /* Nothing left to decode (empty input or only a BOM). */
    if (q == e) {
5841
62
        if (consumed)
5842
0
            *consumed = size;
5843
62
        _Py_RETURN_UNICODE_EMPTY();
5844
62
    }
5845
5846
/* 'le' is non-zero when the input is read as little-endian; bo == 0 falls
   back to the byte order of the build. */
#ifdef WORDS_BIGENDIAN
5847
    le = bo < 0;
5848
#else
5849
32.2k
    le = bo <= 0;
5850
32.2k
#endif
5851
32.2k
    encoding = le ? "utf-32-le" : "utf-32-be";
5852
5853
32.2k
    _PyUnicodeWriter_Init(&writer);
5854
32.2k
    /* One output character per 4 input bytes, counting a trailing partial
       unit as one. */
    writer.min_length = (e - q + 3) / 4;
5855
32.2k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5856
0
        goto onError;
5857
5858
108k
    while (1) {
5859
108k
        Py_UCS4 ch = 0;
5860
108k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5861
5862
108k
        /* Fast path: store code points directly while they fit the buffer's
           current maximum character and are not surrogates.  The loop is
           left early, with q still pointing at the unit, on the first code
           point that needs special treatment. */
        if (e - q >= 4) {
5863
85.6k
            int kind = writer.kind;
5864
85.6k
            void *data = writer.data;
5865
85.6k
            const unsigned char *last = e - 4;
5866
85.6k
            Py_ssize_t pos = writer.pos;
5867
85.6k
            if (le) {
5868
2.68M
                do {
5869
2.68M
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5870
2.68M
                    if (ch > maxch)
5871
80.7k
                        break;
5872
2.60M
                    if (kind != PyUnicode_1BYTE_KIND &&
5873
2.57M
                        Py_UNICODE_IS_SURROGATE(ch))
5874
222
                        break;
5875
2.60M
                    PyUnicode_WRITE(kind, data, pos++, ch);
5876
2.60M
                    q += 4;
5877
2.60M
                } while (q <= last);
5878
82.0k
            }
5879
3.54k
            else {
5880
6.32k
                do {
5881
6.32k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5882
6.32k
                    if (ch > maxch)
5883
3.23k
                        break;
5884
3.08k
                    if (kind != PyUnicode_1BYTE_KIND &&
5885
2.55k
                        Py_UNICODE_IS_SURROGATE(ch))
5886
120
                        break;
5887
2.96k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5888
2.96k
                    q += 4;
5889
2.96k
                } while (q <= last);
5890
3.54k
            }
5891
85.6k
            writer.pos = pos;
5892
85.6k
        }
5893
5894
108k
        /* Work out why the fast path stopped (or was not entered). */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5895
347
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5896
347
            startinpos = ((const char *)q) - starts;
5897
347
            endinpos = startinpos + 4;
5898
347
        }
5899
107k
        else if (ch <= maxch) {
5900
24.0k
            /* Finished: all input used, or the caller accepts leftover
               bytes through 'consumed'. */
            if (q == e || consumed)
5901
5.20k
                break;
5902
            /* remaining bytes at the end? (size should be divisible by 4) */
5903
18.8k
            errmsg = "truncated data";
5904
18.8k
            startinpos = ((const char *)q) - starts;
5905
18.8k
            endinpos = ((const char *)e) - starts;
5906
18.8k
        }
5907
83.9k
        else {
5908
83.9k
            /* A valid code point above the buffer's current maximum: write
               it through the writer and resume the fast path. */
            if (ch < 0x110000) {
5909
5.06k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5910
0
                    goto onError;
5911
5.06k
                q += 4;
5912
5.06k
                continue;
5913
5.06k
            }
5914
78.8k
            errmsg = "code point not in range(0x110000)";
5915
78.8k
            startinpos = ((const char *)q) - starts;
5916
78.8k
            endinpos = startinpos + 4;
5917
78.8k
        }
5918
5919
        /* The remaining input chars are ignored if the callback
5920
           chooses to skip the input */
5921
98.0k
        if (unicode_decode_call_errorhandler_writer(
5922
98.0k
                errors, &errorHandler,
5923
98.0k
                encoding, errmsg,
5924
98.0k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5925
98.0k
                &writer))
5926
27.0k
            goto onError;
5927
98.0k
    }
5928
5929
5.20k
    if (consumed)
5930
0
        *consumed = (const char *)q-starts;
5931
5932
5.20k
    Py_XDECREF(errorHandler);
5933
5.20k
    Py_XDECREF(exc);
5934
5.20k
    return _PyUnicodeWriter_Finish(&writer);
5935
5936
27.0k
  onError:
5937
27.0k
    /* Failure: drop the partial result and the error-handler state. */
    _PyUnicodeWriter_Dealloc(&writer);
5938
27.0k
    Py_XDECREF(errorHandler);
5939
27.0k
    Py_XDECREF(exc);
5940
27.0k
    return NULL;
5941
32.2k
}
5942
5943
/* Encode the str object 'str' to a UTF-32 bytes object.

   errors: name of the error handler, passed to
       unicode_encode_call_errorhandler() for characters the encoding
       helpers do not handle.
   byteorder: -1 selects "utf-32-le", 1 selects "utf-32-be"; 0 selects
       "utf-32", for which a U+FEFF unit is written first.

   Return the bytes object, or NULL on failure.  A non-str argument fails
   through PyErr_BadArgument(). */
PyObject *
5944
_PyUnicode_EncodeUTF32(PyObject *str,
5945
                       const char *errors,
5946
                       int byteorder)
5947
0
{
5948
0
    if (!PyUnicode_Check(str)) {
5949
0
        PyErr_BadArgument();
5950
0
        return NULL;
5951
0
    }
5952
0
    int kind = PyUnicode_KIND(str);
5953
0
    const void *data = PyUnicode_DATA(str);
5954
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
5955
5956
0
    /* nsize * 4 below must not overflow; the BOM adds one unit when
       byteorder == 0. */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5957
0
        return PyErr_NoMemory();
5958
0
    Py_ssize_t nsize = len + (byteorder == 0);
5959
5960
0
/* native_ordering is non-zero when the requested order matches the build's
   byte order; byteorder == 0 always counts as native. */
#if PY_LITTLE_ENDIAN
5961
0
    int native_ordering = byteorder <= 0;
5962
#else
5963
    int native_ordering = byteorder >= 0;
5964
#endif
5965
5966
0
    /* 1-byte strings take a dedicated path that never calls the error
       handler and writes straight into a bytes object. */
    if (kind == PyUnicode_1BYTE_KIND) {
5967
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
5968
        // on short strings
5969
0
        PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5970
0
        if (v == NULL) {
5971
0
            return NULL;
5972
0
        }
5973
5974
        /* output buffer is 4-bytes aligned */
5975
0
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5976
0
        uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v);
5977
0
        if (byteorder == 0) {
5978
0
            *out++ = 0xFEFF;
5979
0
        }
5980
0
        if (len > 0) {
5981
0
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
5982
0
                                 &out, native_ordering);
5983
0
        }
5984
0
        return v;
5985
0
    }
5986
5987
0
    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
5988
0
    if (writer == NULL) {
5989
0
        return NULL;
5990
0
    }
5991
5992
    /* output buffer is 4-bytes aligned */
5993
0
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
5994
0
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
5995
0
    if (byteorder == 0) {
5996
0
        *out++ = 0xFEFF;
5997
0
    }
5998
0
    if (len == 0) {
5999
0
        return PyBytesWriter_Finish(writer);
6000
0
    }
6001
6002
0
    const char *encoding;
6003
0
    if (byteorder == -1)
6004
0
        encoding = "utf-32-le";
6005
0
    else if (byteorder == 1)
6006
0
        encoding = "utf-32-be";
6007
0
    else
6008
0
        encoding = "utf-32";
6009
6010
0
    PyObject *errorHandler = NULL;
6011
0
    PyObject *exc = NULL;
6012
0
    PyObject *rep = NULL;
6013
6014
0
    /* Each pass encodes as many characters as the helper accepts; the
       helper's return value is added to pos.  If it stops before the end,
       the character at pos is handed to the error handler (presumably a
       lone surrogate, given the error message used below). */
    for (Py_ssize_t pos = 0; pos < len; ) {
6015
0
        if (kind == PyUnicode_2BYTE_KIND) {
6016
0
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
6017
0
                                        &out, native_ordering);
6018
0
        }
6019
0
        else {
6020
0
            assert(kind == PyUnicode_4BYTE_KIND);
6021
0
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
6022
0
                                        &out, native_ordering);
6023
0
        }
6024
0
        if (pos == len)
6025
0
            break;
6026
6027
0
        Py_ssize_t newpos;
6028
0
        rep = unicode_encode_call_errorhandler(
6029
0
                errors, &errorHandler,
6030
0
                encoding, "surrogates not allowed",
6031
0
                str, &exc, pos, pos + 1, &newpos);
6032
0
        if (!rep)
6033
0
            goto error;
6034
6035
0
        Py_ssize_t repsize, moreunits;
6036
0
        if (PyBytes_Check(rep)) {
6037
0
            repsize = PyBytes_GET_SIZE(rep);
6038
0
            /* A bytes replacement must be a whole number of 4-byte units. */
            if (repsize & 3) {
6039
0
                raise_encode_exception(&exc, encoding,
6040
0
                                       str, pos, pos + 1,
6041
0
                                       "surrogates not allowed");
6042
0
                goto error;
6043
0
            }
6044
0
            moreunits = repsize / 4;
6045
0
        }
6046
0
        else {
6047
0
            assert(PyUnicode_Check(rep));
6048
0
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6049
0
            /* A str replacement is only accepted when it is pure ASCII. */
            if (!PyUnicode_IS_ASCII(rep)) {
6050
0
                raise_encode_exception(&exc, encoding,
6051
0
                                       str, pos, pos + 1,
6052
0
                                       "surrogates not allowed");
6053
0
                goto error;
6054
0
            }
6055
0
        }
6056
0
        /* Units needed beyond those already reserved: the replacement
           length minus the (newpos - pos) input characters it replaces. */
        moreunits += pos - newpos;
6057
0
        pos = newpos;
6058
6059
        /* four bytes are reserved for each surrogate */
6060
0
        if (moreunits > 0) {
6061
0
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
6062
0
            if (out == NULL) {
6063
0
                goto error;
6064
0
            }
6065
0
        }
6066
6067
0
        if (PyBytes_Check(rep)) {
6068
0
            /* Bytes replacements are copied verbatim. */
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
6069
0
            out += repsize / 4;
6070
0
        }
6071
0
        else {
6072
            /* rep is unicode */
6073
0
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6074
0
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6075
0
                                 &out, native_ordering);
6076
0
        }
6077
6078
0
        Py_CLEAR(rep);
6079
0
    }
6080
6081
0
    Py_XDECREF(errorHandler);
6082
0
    Py_XDECREF(exc);
6083
6084
    /* Cut back to size actually needed. This is necessary for, for example,
6085
       encoding of a string containing isolated surrogates and the 'ignore'
6086
       handler is used. */
6087
0
    return PyBytesWriter_FinishWithPointer(writer, out);
6088
6089
0
  error:
6090
0
    /* Failure: release the replacement, the handler state and the writer. */
    Py_XDECREF(rep);
6091
0
    Py_XDECREF(errorHandler);
6092
0
    Py_XDECREF(exc);
6093
0
    PyBytesWriter_Discard(writer);
6094
0
    return NULL;
6095
0
}
6096
6097
/* Encode 'unicode' to UTF-32 by calling _PyUnicode_EncodeUTF32() with
   errors == NULL and byteorder == 0. */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    const char *errors = NULL;
    const int byteorder = 0;
    return _PyUnicode_EncodeUTF32(unicode, errors, byteorder);
}
6102
6103
/* --- UTF-16 Codec ------------------------------------------------------- */
6104
6105
PyObject *
6106
PyUnicode_DecodeUTF16(const char *s,
6107
                      Py_ssize_t size,
6108
                      const char *errors,
6109
                      int *byteorder)
6110
172
{
6111
172
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6112
172
}
6113
6114
PyObject *
6115
PyUnicode_DecodeUTF16Stateful(const char *s,
6116
                              Py_ssize_t size,
6117
                              const char *errors,
6118
                              int *byteorder,
6119
                              Py_ssize_t *consumed)
6120
17.0k
{
6121
17.0k
    const char *starts = s;
6122
17.0k
    Py_ssize_t startinpos;
6123
17.0k
    Py_ssize_t endinpos;
6124
17.0k
    _PyUnicodeWriter writer;
6125
17.0k
    const unsigned char *q, *e;
6126
17.0k
    int bo = 0;       /* assume native ordering by default */
6127
17.0k
    int native_ordering;
6128
17.0k
    const char *errmsg = "";
6129
17.0k
    PyObject *errorHandler = NULL;
6130
17.0k
    PyObject *exc = NULL;
6131
17.0k
    const char *encoding;
6132
6133
17.0k
    q = (const unsigned char *)s;
6134
17.0k
    e = q + size;
6135
6136
17.0k
    if (byteorder)
6137
16.9k
        bo = *byteorder;
6138
6139
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6140
       byte order setting accordingly. In native mode, the leading BOM
6141
       mark is skipped, in all other modes, it is copied to the output
6142
       stream as-is (giving a ZWNBSP character). */
6143
17.0k
    if (bo == 0 && size >= 2) {
6144
16.2k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6145
16.2k
        if (bom == 0xFEFF) {
6146
265
            q += 2;
6147
265
            bo = -1;
6148
265
        }
6149
15.9k
        else if (bom == 0xFFFE) {
6150
2.44k
            q += 2;
6151
2.44k
            bo = 1;
6152
2.44k
        }
6153
16.2k
        if (byteorder)
6154
16.0k
            *byteorder = bo;
6155
16.2k
    }
6156
6157
17.0k
    if (q == e) {
6158
69
        if (consumed)
6159
0
            *consumed = size;
6160
69
        _Py_RETURN_UNICODE_EMPTY();
6161
69
    }
6162
6163
17.0k
#if PY_LITTLE_ENDIAN
6164
17.0k
    native_ordering = bo <= 0;
6165
17.0k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6166
#else
6167
    native_ordering = bo >= 0;
6168
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6169
#endif
6170
6171
    /* Note: size will always be longer than the resulting Unicode
6172
       character count normally.  Error handler will take care of
6173
       resizing when needed. */
6174
17.0k
    _PyUnicodeWriter_Init(&writer);
6175
17.0k
    writer.min_length = (e - q + 1) / 2;
6176
17.0k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6177
0
        goto onError;
6178
6179
58.8k
    while (1) {
6180
58.8k
        Py_UCS4 ch = 0;
6181
58.8k
        if (e - q >= 2) {
6182
49.8k
            int kind = writer.kind;
6183
49.8k
            if (kind == PyUnicode_1BYTE_KIND) {
6184
20.0k
                if (PyUnicode_IS_ASCII(writer.buffer))
6185
16.4k
                    ch = asciilib_utf16_decode(&q, e,
6186
16.4k
                            (Py_UCS1*)writer.data, &writer.pos,
6187
16.4k
                            native_ordering);
6188
3.63k
                else
6189
3.63k
                    ch = ucs1lib_utf16_decode(&q, e,
6190
3.63k
                            (Py_UCS1*)writer.data, &writer.pos,
6191
3.63k
                            native_ordering);
6192
29.8k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6193
16.1k
                ch = ucs2lib_utf16_decode(&q, e,
6194
16.1k
                        (Py_UCS2*)writer.data, &writer.pos,
6195
16.1k
                        native_ordering);
6196
16.1k
            } else {
6197
13.6k
                assert(kind == PyUnicode_4BYTE_KIND);
6198
13.6k
                ch = ucs4lib_utf16_decode(&q, e,
6199
13.6k
                        (Py_UCS4*)writer.data, &writer.pos,
6200
13.6k
                        native_ordering);
6201
13.6k
            }
6202
49.8k
        }
6203
6204
58.8k
        switch (ch)
6205
58.8k
        {
6206
17.7k
        case 0:
6207
            /* remaining byte at the end? (size should be even) */
6208
17.7k
            if (q == e || consumed)
6209
11.6k
                goto End;
6210
6.12k
            errmsg = "truncated data";
6211
6.12k
            startinpos = ((const char *)q) - starts;
6212
6.12k
            endinpos = ((const char *)e) - starts;
6213
6.12k
            break;
6214
            /* The remaining input chars are ignored if the callback
6215
               chooses to skip the input */
6216
1.72k
        case 1:
6217
1.72k
            q -= 2;
6218
1.72k
            if (consumed)
6219
0
                goto End;
6220
1.72k
            errmsg = "unexpected end of data";
6221
1.72k
            startinpos = ((const char *)q) - starts;
6222
1.72k
            endinpos = ((const char *)e) - starts;
6223
1.72k
            break;
6224
12.2k
        case 2:
6225
12.2k
            errmsg = "illegal encoding";
6226
12.2k
            startinpos = ((const char *)q) - 2 - starts;
6227
12.2k
            endinpos = startinpos + 2;
6228
12.2k
            break;
6229
8.78k
        case 3:
6230
8.78k
            errmsg = "illegal UTF-16 surrogate";
6231
8.78k
            startinpos = ((const char *)q) - 4 - starts;
6232
8.78k
            endinpos = startinpos + 2;
6233
8.78k
            break;
6234
18.3k
        default:
6235
18.3k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6236
0
                goto onError;
6237
18.3k
            continue;
6238
58.8k
        }
6239
6240
28.8k
        if (unicode_decode_call_errorhandler_writer(
6241
28.8k
                errors,
6242
28.8k
                &errorHandler,
6243
28.8k
                encoding, errmsg,
6244
28.8k
                &starts,
6245
28.8k
                (const char **)&e,
6246
28.8k
                &startinpos,
6247
28.8k
                &endinpos,
6248
28.8k
                &exc,
6249
28.8k
                (const char **)&q,
6250
28.8k
                &writer))
6251
5.38k
            goto onError;
6252
28.8k
    }
6253
6254
11.6k
End:
6255
11.6k
    if (consumed)
6256
0
        *consumed = (const char *)q-starts;
6257
6258
11.6k
    Py_XDECREF(errorHandler);
6259
11.6k
    Py_XDECREF(exc);
6260
11.6k
    return _PyUnicodeWriter_Finish(&writer);
6261
6262
5.38k
  onError:
6263
5.38k
    _PyUnicodeWriter_Dealloc(&writer);
6264
5.38k
    Py_XDECREF(errorHandler);
6265
5.38k
    Py_XDECREF(exc);
6266
5.38k
    return NULL;
6267
17.0k
}
6268
6269
PyObject *
6270
_PyUnicode_EncodeUTF16(PyObject *str,
6271
                       const char *errors,
6272
                       int byteorder)
6273
6.75k
{
6274
6.75k
    if (!PyUnicode_Check(str)) {
6275
0
        PyErr_BadArgument();
6276
0
        return NULL;
6277
0
    }
6278
6.75k
    int kind = PyUnicode_KIND(str);
6279
6.75k
    const void *data = PyUnicode_DATA(str);
6280
6.75k
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
6281
6282
6.75k
    Py_ssize_t pairs = 0;
6283
6.75k
    if (kind == PyUnicode_4BYTE_KIND) {
6284
0
        const Py_UCS4 *in = (const Py_UCS4 *)data;
6285
0
        const Py_UCS4 *end = in + len;
6286
0
        while (in < end) {
6287
0
            if (*in++ >= 0x10000) {
6288
0
                pairs++;
6289
0
            }
6290
0
        }
6291
0
    }
6292
6.75k
    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
6293
0
        return PyErr_NoMemory();
6294
0
    }
6295
6.75k
    Py_ssize_t nsize = len + pairs + (byteorder == 0);
6296
6297
#if PY_BIG_ENDIAN
6298
    int native_ordering = byteorder >= 0;
6299
#else
6300
6.75k
    int native_ordering = byteorder <= 0;
6301
6.75k
#endif
6302
6303
6.75k
    if (kind == PyUnicode_1BYTE_KIND) {
6304
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
6305
        // on short strings
6306
6.68k
        PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6307
6.68k
        if (v == NULL) {
6308
0
            return NULL;
6309
0
        }
6310
6311
        /* output buffer is 2-bytes aligned */
6312
6.68k
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6313
6.68k
        unsigned short *out = (unsigned short *)PyBytes_AS_STRING(v);
6314
6.68k
        if (byteorder == 0) {
6315
0
            *out++ = 0xFEFF;
6316
0
        }
6317
6.68k
        if (len > 0) {
6318
6.68k
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6319
6.68k
        }
6320
6.68k
        return v;
6321
6.68k
    }
6322
6323
67
    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2);
6324
67
    if (writer == NULL) {
6325
0
        return NULL;
6326
0
    }
6327
6328
    /* output buffer is 2-bytes aligned */
6329
67
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
6330
67
    unsigned short *out = PyBytesWriter_GetData(writer);
6331
67
    if (byteorder == 0) {
6332
0
        *out++ = 0xFEFF;
6333
0
    }
6334
67
    if (len == 0) {
6335
0
        return PyBytesWriter_Finish(writer);
6336
0
    }
6337
6338
67
    const char *encoding;
6339
67
    if (byteorder < 0) {
6340
0
        encoding = "utf-16-le";
6341
0
    }
6342
67
    else if (byteorder > 0) {
6343
67
        encoding = "utf-16-be";
6344
67
    }
6345
0
    else {
6346
0
        encoding = "utf-16";
6347
0
    }
6348
6349
67
    PyObject *errorHandler = NULL;
6350
67
    PyObject *exc = NULL;
6351
67
    PyObject *rep = NULL;
6352
6353
67
    for (Py_ssize_t pos = 0; pos < len; ) {
6354
67
        if (kind == PyUnicode_2BYTE_KIND) {
6355
67
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6356
67
                                        &out, native_ordering);
6357
67
        }
6358
0
        else {
6359
0
            assert(kind == PyUnicode_4BYTE_KIND);
6360
0
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6361
0
                                        &out, native_ordering);
6362
0
        }
6363
67
        if (pos == len)
6364
67
            break;
6365
6366
0
        Py_ssize_t newpos;
6367
0
        rep = unicode_encode_call_errorhandler(
6368
0
                errors, &errorHandler,
6369
0
                encoding, "surrogates not allowed",
6370
0
                str, &exc, pos, pos + 1, &newpos);
6371
0
        if (!rep)
6372
0
            goto error;
6373
6374
0
        Py_ssize_t repsize, moreunits;
6375
0
        if (PyBytes_Check(rep)) {
6376
0
            repsize = PyBytes_GET_SIZE(rep);
6377
0
            if (repsize & 1) {
6378
0
                raise_encode_exception(&exc, encoding,
6379
0
                                       str, pos, pos + 1,
6380
0
                                       "surrogates not allowed");
6381
0
                goto error;
6382
0
            }
6383
0
            moreunits = repsize / 2;
6384
0
        }
6385
0
        else {
6386
0
            assert(PyUnicode_Check(rep));
6387
0
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6388
0
            if (!PyUnicode_IS_ASCII(rep)) {
6389
0
                raise_encode_exception(&exc, encoding,
6390
0
                                       str, pos, pos + 1,
6391
0
                                       "surrogates not allowed");
6392
0
                goto error;
6393
0
            }
6394
0
        }
6395
0
        moreunits += pos - newpos;
6396
0
        pos = newpos;
6397
6398
        /* two bytes are reserved for each surrogate */
6399
0
        if (moreunits > 0) {
6400
0
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
6401
0
            if (out == NULL) {
6402
0
                goto error;
6403
0
            }
6404
0
        }
6405
6406
0
        if (PyBytes_Check(rep)) {
6407
0
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
6408
0
            out += repsize / 2;
6409
0
        } else {
6410
            /* rep is unicode */
6411
0
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6412
0
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6413
0
                                 &out, native_ordering);
6414
0
        }
6415
6416
0
        Py_CLEAR(rep);
6417
0
    }
6418
6419
67
    Py_XDECREF(errorHandler);
6420
67
    Py_XDECREF(exc);
6421
6422
    /* Cut back to size actually needed. This is necessary for, for example,
6423
    encoding of a string containing isolated surrogates and the 'ignore' handler
6424
    is used. */
6425
67
    return PyBytesWriter_FinishWithPointer(writer, out);
6426
6427
0
  error:
6428
0
    Py_XDECREF(rep);
6429
0
    Py_XDECREF(errorHandler);
6430
0
    Py_XDECREF(exc);
6431
0
    PyBytesWriter_Discard(writer);
6432
0
    return NULL;
6433
67
}
6434
6435
PyObject *
6436
PyUnicode_AsUTF16String(PyObject *unicode)
6437
0
{
6438
0
    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6439
0
}
6440
6441
_PyUnicode_Name_CAPI *
6442
_PyUnicode_GetNameCAPI(void)
6443
15.3k
{
6444
15.3k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6445
15.3k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6446
6447
15.3k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6448
15.3k
    if (ucnhash_capi == NULL) {
6449
2
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6450
2
                PyUnicodeData_CAPSULE_NAME, 1);
6451
6452
        // It's fine if we overwrite the value here. It's always the same value.
6453
2
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6454
2
    }
6455
15.3k
    return ucnhash_capi;
6456
15.3k
}
6457
6458
/* --- Unicode Escape Codec ----------------------------------------------- */
6459
6460
PyObject *
6461
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6462
                               Py_ssize_t size,
6463
                               const char *errors,
6464
                               Py_ssize_t *consumed,
6465
                               int *first_invalid_escape_char,
6466
                               const char **first_invalid_escape_ptr)
6467
28.6k
{
6468
28.6k
    const char *starts = s;
6469
28.6k
    const char *initial_starts = starts;
6470
28.6k
    _PyUnicodeWriter writer;
6471
28.6k
    const char *end;
6472
28.6k
    PyObject *errorHandler = NULL;
6473
28.6k
    PyObject *exc = NULL;
6474
28.6k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6475
6476
    // so we can remember if we've seen an invalid escape char or not
6477
28.6k
    *first_invalid_escape_char = -1;
6478
28.6k
    *first_invalid_escape_ptr = NULL;
6479
6480
28.6k
    if (size == 0) {
6481
2.48k
        if (consumed) {
6482
0
            *consumed = 0;
6483
0
        }
6484
2.48k
        _Py_RETURN_UNICODE_EMPTY();
6485
2.48k
    }
6486
    /* Escaped strings will always be longer than the resulting
6487
       Unicode string, so we start with size here and then reduce the
6488
       length after conversion to the true value.
6489
       (but if the error callback returns a long replacement string
6490
       we'll have to allocate more space) */
6491
26.2k
    _PyUnicodeWriter_Init(&writer);
6492
26.2k
    writer.min_length = size;
6493
26.2k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6494
0
        goto onError;
6495
0
    }
6496
6497
26.2k
    end = s + size;
6498
8.94M
    while (s < end) {
6499
8.92M
        unsigned char c = (unsigned char) *s++;
6500
8.92M
        Py_UCS4 ch;
6501
8.92M
        int count;
6502
8.92M
        const char *message;
6503
6504
8.92M
#define WRITE_ASCII_CHAR(ch)                                                  \
6505
8.92M
            do {                                                              \
6506
114k
                assert(ch <= 127);                                            \
6507
114k
                assert(writer.pos < writer.size);                             \
6508
114k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6509
114k
            } while(0)
6510
6511
8.92M
#define WRITE_CHAR(ch)                                                        \
6512
8.92M
            do {                                                              \
6513
8.84M
                if (ch <= writer.maxchar) {                                   \
6514
8.82M
                    assert(writer.pos < writer.size);                         \
6515
8.82M
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6516
8.82M
                }                                                             \
6517
8.84M
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6518
0
                    goto onError;                                             \
6519
0
                }                                                             \
6520
8.84M
            } while(0)
6521
6522
        /* Non-escape characters are interpreted as Unicode ordinals */
6523
8.92M
        if (c != '\\') {
6524
8.64M
            WRITE_CHAR(c);
6525
8.64M
            continue;
6526
8.64M
        }
6527
6528
278k
        Py_ssize_t startinpos = s - starts - 1;
6529
        /* \ - Escapes */
6530
278k
        if (s >= end) {
6531
15
            message = "\\ at end of string";
6532
15
            goto incomplete;
6533
15
        }
6534
278k
        c = (unsigned char) *s++;
6535
6536
278k
        assert(writer.pos < writer.size);
6537
278k
        switch (c) {
6538
6539
            /* \x escapes */
6540
1.03k
        case '\n': continue;
6541
37.0k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6542
1.82k
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6543
3.41k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6544
3.00k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6545
        /* FF */
6546
6.73k
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6547
1.57k
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6548
2.13k
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6549
6.54k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6550
        /* VT */
6551
14.1k
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6552
        /* BEL, not classic C */
6553
2.39k
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6554
6555
            /* \OOO (octal) escapes */
6556
42.1k
        case '0': case '1': case '2': case '3':
6557
108k
        case '4': case '5': case '6': case '7':
6558
108k
            ch = c - '0';
6559
108k
            if (s < end && '0' <= *s && *s <= '7') {
6560
64.7k
                ch = (ch<<3) + *s++ - '0';
6561
64.7k
                if (s < end && '0' <= *s && *s <= '7') {
6562
51.1k
                    ch = (ch<<3) + *s++ - '0';
6563
51.1k
                }
6564
64.7k
            }
6565
108k
            if (ch > 0377) {
6566
49.0k
                if (*first_invalid_escape_char == -1) {
6567
1.28k
                    *first_invalid_escape_char = ch;
6568
1.28k
                    if (starts == initial_starts) {
6569
                        /* Back up 3 chars, since we've already incremented s. */
6570
1.28k
                        *first_invalid_escape_ptr = s - 3;
6571
1.28k
                    }
6572
1.28k
                }
6573
49.0k
            }
6574
108k
            WRITE_CHAR(ch);
6575
108k
            continue;
6576
6577
            /* hex escapes */
6578
            /* \xXX */
6579
108k
        case 'x':
6580
14.7k
            count = 2;
6581
14.7k
            message = "truncated \\xXX escape";
6582
14.7k
            goto hexescape;
6583
6584
            /* \uXXXX */
6585
6.73k
        case 'u':
6586
6.73k
            count = 4;
6587
6.73k
            message = "truncated \\uXXXX escape";
6588
6.73k
            goto hexescape;
6589
6590
            /* \UXXXXXXXX */
6591
17.1k
        case 'U':
6592
17.1k
            count = 8;
6593
17.1k
            message = "truncated \\UXXXXXXXX escape";
6594
38.6k
        hexescape:
6595
232k
            for (ch = 0; count; ++s, --count) {
6596
193k
                if (s >= end) {
6597
20
                    goto incomplete;
6598
20
                }
6599
193k
                c = (unsigned char)*s;
6600
193k
                ch <<= 4;
6601
193k
                if (c >= '0' && c <= '9') {
6602
138k
                    ch += c - '0';
6603
138k
                }
6604
54.6k
                else if (c >= 'a' && c <= 'f') {
6605
54.2k
                    ch += c - ('a' - 10);
6606
54.2k
                }
6607
395
                else if (c >= 'A' && c <= 'F') {
6608
363
                    ch += c - ('A' - 10);
6609
363
                }
6610
32
                else {
6611
32
                    goto error;
6612
32
                }
6613
193k
            }
6614
6615
            /* when we get here, ch is a 32-bit unicode character */
6616
38.6k
            if (ch > MAX_UNICODE) {
6617
8
                message = "illegal Unicode character";
6618
8
                goto error;
6619
8
            }
6620
6621
38.5k
            WRITE_CHAR(ch);
6622
38.5k
            continue;
6623
6624
            /* \N{name} */
6625
38.5k
        case 'N':
6626
15.3k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6627
15.3k
            if (ucnhash_capi == NULL) {
6628
0
                PyErr_SetString(
6629
0
                        PyExc_UnicodeError,
6630
0
                        "\\N escapes not supported (can't load unicodedata module)"
6631
0
                );
6632
0
                goto onError;
6633
0
            }
6634
6635
15.3k
            message = "malformed \\N character escape";
6636
15.3k
            if (s >= end) {
6637
8
                goto incomplete;
6638
8
            }
6639
15.3k
            if (*s == '{') {
6640
15.3k
                const char *start = ++s;
6641
15.3k
                size_t namelen;
6642
                /* look for the closing brace */
6643
80.3k
                while (s < end && *s != '}')
6644
64.9k
                    s++;
6645
15.3k
                if (s >= end) {
6646
24
                    goto incomplete;
6647
24
                }
6648
15.3k
                namelen = s - start;
6649
15.3k
                if (namelen) {
6650
                    /* found a name.  look it up in the unicode database */
6651
15.3k
                    s++;
6652
15.3k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6653
15.3k
                    if (namelen <= INT_MAX &&
6654
15.3k
                        ucnhash_capi->getcode(start, (int)namelen,
6655
15.3k
                                              &ch, 0)) {
6656
15.1k
                        assert(ch <= MAX_UNICODE);
6657
15.1k
                        WRITE_CHAR(ch);
6658
15.1k
                        continue;
6659
15.1k
                    }
6660
108
                    message = "unknown Unicode character name";
6661
108
                }
6662
15.3k
            }
6663
128
            goto error;
6664
6665
35.8k
        default:
6666
35.8k
            if (*first_invalid_escape_char == -1) {
6667
4.06k
                *first_invalid_escape_char = c;
6668
4.06k
                if (starts == initial_starts) {
6669
                    /* Back up one char, since we've already incremented s. */
6670
4.06k
                    *first_invalid_escape_ptr = s - 1;
6671
4.06k
                }
6672
4.06k
            }
6673
35.8k
            WRITE_ASCII_CHAR('\\');
6674
35.8k
            WRITE_CHAR(c);
6675
35.8k
            continue;
6676
278k
        }
6677
6678
67
      incomplete:
6679
67
        if (consumed) {
6680
0
            *consumed = startinpos;
6681
0
            break;
6682
0
        }
6683
235
      error:;
6684
235
        Py_ssize_t endinpos = s-starts;
6685
235
        writer.min_length = end - s + writer.pos;
6686
235
        if (unicode_decode_call_errorhandler_writer(
6687
235
                errors, &errorHandler,
6688
235
                "unicodeescape", message,
6689
235
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6690
235
                &writer)) {
6691
235
            goto onError;
6692
235
        }
6693
235
        assert(end - s <= writer.size - writer.pos);
6694
6695
0
#undef WRITE_ASCII_CHAR
6696
0
#undef WRITE_CHAR
6697
0
    }
6698
6699
25.9k
    Py_XDECREF(errorHandler);
6700
25.9k
    Py_XDECREF(exc);
6701
25.9k
    return _PyUnicodeWriter_Finish(&writer);
6702
6703
235
  onError:
6704
235
    _PyUnicodeWriter_Dealloc(&writer);
6705
235
    Py_XDECREF(errorHandler);
6706
235
    Py_XDECREF(exc);
6707
235
    return NULL;
6708
26.2k
}
6709
6710
PyObject *
6711
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6712
                              Py_ssize_t size,
6713
                              const char *errors,
6714
                              Py_ssize_t *consumed)
6715
545
{
6716
545
    int first_invalid_escape_char;
6717
545
    const char *first_invalid_escape_ptr;
6718
545
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6719
545
                                                      consumed,
6720
545
                                                      &first_invalid_escape_char,
6721
545
                                                      &first_invalid_escape_ptr);
6722
545
    if (result == NULL)
6723
122
        return NULL;
6724
423
    if (first_invalid_escape_char != -1) {
6725
303
        if (first_invalid_escape_char > 0xff) {
6726
96
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6727
96
                                 "\"\\%o\" is an invalid octal escape sequence. "
6728
96
                                 "Such sequences will not work in the future. ",
6729
96
                                 first_invalid_escape_char) < 0)
6730
0
            {
6731
0
                Py_DECREF(result);
6732
0
                return NULL;
6733
0
            }
6734
96
        }
6735
207
        else {
6736
207
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6737
207
                                 "\"\\%c\" is an invalid escape sequence. "
6738
207
                                 "Such sequences will not work in the future. ",
6739
207
                                 first_invalid_escape_char) < 0)
6740
0
            {
6741
0
                Py_DECREF(result);
6742
0
                return NULL;
6743
0
            }
6744
207
        }
6745
303
    }
6746
423
    return result;
6747
423
}
6748
6749
PyObject *
6750
PyUnicode_DecodeUnicodeEscape(const char *s,
6751
                              Py_ssize_t size,
6752
                              const char *errors)
6753
0
{
6754
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6755
0
}
6756
6757
/* Return a Unicode-Escape string version of the Unicode object. */
6758
6759
PyObject *
6760
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6761
392k
{
6762
392k
    if (!PyUnicode_Check(unicode)) {
6763
0
        PyErr_BadArgument();
6764
0
        return NULL;
6765
0
    }
6766
6767
392k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
6768
392k
    if (len == 0) {
6769
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
6770
0
    }
6771
392k
    int kind = PyUnicode_KIND(unicode);
6772
392k
    const void *data = PyUnicode_DATA(unicode);
6773
6774
    /* Initial allocation is based on the longest-possible character
6775
     * escape.
6776
     *
6777
     * For UCS1 strings it's '\xxx', 4 bytes per source character.
6778
     * For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6779
     * For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. */
6780
392k
    Py_ssize_t expandsize = kind * 2 + 2;
6781
392k
    if (len > PY_SSIZE_T_MAX / expandsize) {
6782
0
        return PyErr_NoMemory();
6783
0
    }
6784
6785
392k
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
6786
392k
    if (writer == NULL) {
6787
0
        return NULL;
6788
0
    }
6789
392k
    char *p = PyBytesWriter_GetData(writer);
6790
6791
784k
    for (Py_ssize_t i = 0; i < len; i++) {
6792
392k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6793
6794
        /* U+0000-U+00ff range */
6795
392k
        if (ch < 0x100) {
6796
386k
            if (ch >= ' ' && ch < 127) {
6797
43.9k
                if (ch != '\\') {
6798
                    /* Copy printable US ASCII as-is */
6799
0
                    *p++ = (char) ch;
6800
0
                }
6801
                /* Escape backslashes */
6802
43.9k
                else {
6803
43.9k
                    *p++ = '\\';
6804
43.9k
                    *p++ = '\\';
6805
43.9k
                }
6806
43.9k
            }
6807
6808
            /* Map special whitespace to '\t', \n', '\r' */
6809
342k
            else if (ch == '\t') {
6810
4.64k
                *p++ = '\\';
6811
4.64k
                *p++ = 't';
6812
4.64k
            }
6813
337k
            else if (ch == '\n') {
6814
2.26k
                *p++ = '\\';
6815
2.26k
                *p++ = 'n';
6816
2.26k
            }
6817
335k
            else if (ch == '\r') {
6818
666
                *p++ = '\\';
6819
666
                *p++ = 'r';
6820
666
            }
6821
6822
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6823
334k
            else {
6824
334k
                *p++ = '\\';
6825
334k
                *p++ = 'x';
6826
334k
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6827
334k
                *p++ = Py_hexdigits[ch & 0x000F];
6828
334k
            }
6829
386k
        }
6830
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6831
6.45k
        else if (ch < 0x10000) {
6832
5.24k
            *p++ = '\\';
6833
5.24k
            *p++ = 'u';
6834
5.24k
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6835
5.24k
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6836
5.24k
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6837
5.24k
            *p++ = Py_hexdigits[ch & 0x000F];
6838
5.24k
        }
6839
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6840
1.20k
        else {
6841
6842
            /* Make sure that the first two digits are zero */
6843
1.20k
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6844
1.20k
            *p++ = '\\';
6845
1.20k
            *p++ = 'U';
6846
1.20k
            *p++ = '0';
6847
1.20k
            *p++ = '0';
6848
1.20k
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6849
1.20k
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6850
1.20k
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6851
1.20k
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6852
1.20k
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6853
1.20k
            *p++ = Py_hexdigits[ch & 0x0000000F];
6854
1.20k
        }
6855
392k
    }
6856
6857
392k
    return PyBytesWriter_FinishWithPointer(writer, p);
6858
392k
}
6859
6860
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6861
6862
PyObject *
6863
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6864
                                          Py_ssize_t size,
6865
                                          const char *errors,
6866
                                          Py_ssize_t *consumed)
6867
112
{
6868
112
    const char *starts = s;
6869
112
    _PyUnicodeWriter writer;
6870
112
    const char *end;
6871
112
    PyObject *errorHandler = NULL;
6872
112
    PyObject *exc = NULL;
6873
6874
112
    if (size == 0) {
6875
0
        if (consumed) {
6876
0
            *consumed = 0;
6877
0
        }
6878
0
        _Py_RETURN_UNICODE_EMPTY();
6879
0
    }
6880
6881
    /* Escaped strings will always be longer than the resulting
6882
       Unicode string, so we start with size here and then reduce the
6883
       length after conversion to the true value. (But decoding error
6884
       handler might have to resize the string) */
6885
112
    _PyUnicodeWriter_Init(&writer);
6886
112
    writer.min_length = size;
6887
112
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6888
0
        goto onError;
6889
0
    }
6890
6891
112
    end = s + size;
6892
46.7k
    while (s < end) {
6893
46.6k
        unsigned char c = (unsigned char) *s++;
6894
46.6k
        Py_UCS4 ch;
6895
46.6k
        int count;
6896
46.6k
        const char *message;
6897
6898
46.6k
#define WRITE_CHAR(ch)                                                        \
6899
46.6k
            do {                                                              \
6900
46.6k
                if (ch <= writer.maxchar) {                                   \
6901
46.5k
                    assert(writer.pos < writer.size);                         \
6902
46.5k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6903
46.5k
                }                                                             \
6904
46.6k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6905
0
                    goto onError;                                             \
6906
0
                }                                                             \
6907
46.6k
            } while(0)
6908
6909
        /* Non-escape characters are interpreted as Unicode ordinals */
6910
46.6k
        if (c != '\\' || (s >= end && !consumed)) {
6911
43.5k
            WRITE_CHAR(c);
6912
43.5k
            continue;
6913
43.5k
        }
6914
6915
3.07k
        Py_ssize_t startinpos = s - starts - 1;
6916
        /* \ - Escapes */
6917
3.07k
        if (s >= end) {
6918
0
            assert(consumed);
6919
            // Set message to silent compiler warning.
6920
            // Actually it is never used.
6921
0
            message = "\\ at end of string";
6922
0
            goto incomplete;
6923
0
        }
6924
6925
3.07k
        c = (unsigned char) *s++;
6926
3.07k
        if (c == 'u') {
6927
404
            count = 4;
6928
404
            message = "truncated \\uXXXX escape";
6929
404
        }
6930
2.66k
        else if (c == 'U') {
6931
542
            count = 8;
6932
542
            message = "truncated \\UXXXXXXXX escape";
6933
542
        }
6934
2.12k
        else {
6935
2.12k
            assert(writer.pos < writer.size);
6936
2.12k
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6937
2.12k
            WRITE_CHAR(c);
6938
2.12k
            continue;
6939
2.12k
        }
6940
6941
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6942
6.78k
        for (ch = 0; count; ++s, --count) {
6943
5.86k
            if (s >= end) {
6944
7
                goto incomplete;
6945
7
            }
6946
5.85k
            c = (unsigned char)*s;
6947
5.85k
            ch <<= 4;
6948
5.85k
            if (c >= '0' && c <= '9') {
6949
5.00k
                ch += c - '0';
6950
5.00k
            }
6951
850
            else if (c >= 'a' && c <= 'f') {
6952
736
                ch += c - ('a' - 10);
6953
736
            }
6954
114
            else if (c >= 'A' && c <= 'F') {
6955
95
                ch += c - ('A' - 10);
6956
95
            }
6957
19
            else {
6958
19
                goto error;
6959
19
            }
6960
5.85k
        }
6961
920
        if (ch > MAX_UNICODE) {
6962
3
            message = "\\Uxxxxxxxx out of range";
6963
3
            goto error;
6964
3
        }
6965
917
        WRITE_CHAR(ch);
6966
917
        continue;
6967
6968
917
      incomplete:
6969
7
        if (consumed) {
6970
0
            *consumed = startinpos;
6971
0
            break;
6972
0
        }
6973
29
      error:;
6974
29
        Py_ssize_t endinpos = s-starts;
6975
29
        writer.min_length = end - s + writer.pos;
6976
29
        if (unicode_decode_call_errorhandler_writer(
6977
29
                errors, &errorHandler,
6978
29
                "rawunicodeescape", message,
6979
29
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6980
29
                &writer)) {
6981
29
            goto onError;
6982
29
        }
6983
29
        assert(end - s <= writer.size - writer.pos);
6984
6985
0
#undef WRITE_CHAR
6986
0
    }
6987
83
    Py_XDECREF(errorHandler);
6988
83
    Py_XDECREF(exc);
6989
83
    return _PyUnicodeWriter_Finish(&writer);
6990
6991
29
  onError:
6992
29
    _PyUnicodeWriter_Dealloc(&writer);
6993
29
    Py_XDECREF(errorHandler);
6994
29
    Py_XDECREF(exc);
6995
29
    return NULL;
6996
112
}
6997
6998
PyObject *
6999
PyUnicode_DecodeRawUnicodeEscape(const char *s,
7000
                                 Py_ssize_t size,
7001
                                 const char *errors)
7002
0
{
7003
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
7004
0
}
7005
7006
7007
/* Encode a str object with the raw-unicode-escape codec.

   Code points below U+0100 are copied as single bytes, U+0100..U+FFFF
   become \uHHHH and everything above becomes \U00HHHHHH.  Returns a bytes
   object, or NULL with an exception set if `unicode` is not a str or the
   output size would overflow. */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    /* One-byte strings need no escaping at all. */
    if (kind == PyUnicode_1BYTE_KIND) {
        return PyBytes_FromStringAndSize(data, len);
    }

    /* Worst case per character: 10 bytes for a 4-byte kind, 6 bytes for a
       2-byte kind. */
    Py_ssize_t expandsize = kind * 2 + 2;
    if (len > PY_SSIZE_T_MAX / expandsize) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < len; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (ch < 0x100) {
            /* U+0000-U+00ff: copied through unchanged */
            *out++ = (char) ch;
        }
        else if (ch < 0x10000) {
            /* U+0100-U+ffff: '\uHHHH' */
            *out++ = '\\';
            *out++ = 'u';
            for (int shift = 12; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xf];
            }
        }
        else {
            /* U+010000-U+10ffff: '\U00HHHHHH' */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = '\\';
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            for (int shift = 20; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xf];
            }
        }
    }

    return PyBytesWriter_FinishWithPointer(writer, out);
}
7071
7072
/* --- Latin-1 Codec ------------------------------------------------------ */
7073
7074
PyObject *
7075
PyUnicode_DecodeLatin1(const char *s,
7076
                       Py_ssize_t size,
7077
                       const char *errors)
7078
2.88M
{
7079
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7080
2.88M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7081
2.88M
}
7082
7083
/* create or adjust a UnicodeEncodeError */
7084
static void
7085
make_encode_exception(PyObject **exceptionObject,
7086
                      const char *encoding,
7087
                      PyObject *unicode,
7088
                      Py_ssize_t startpos, Py_ssize_t endpos,
7089
                      const char *reason)
7090
733k
{
7091
733k
    if (*exceptionObject == NULL) {
7092
733k
        *exceptionObject = PyObject_CallFunction(
7093
733k
            PyExc_UnicodeEncodeError, "sOnns",
7094
733k
            encoding, unicode, startpos, endpos, reason);
7095
733k
    }
7096
0
    else {
7097
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7098
0
            goto onError;
7099
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7100
0
            goto onError;
7101
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7102
0
            goto onError;
7103
0
        return;
7104
0
      onError:
7105
0
        Py_CLEAR(*exceptionObject);
7106
0
    }
7107
733k
}
7108
7109
/* raises a UnicodeEncodeError */
7110
static void
7111
raise_encode_exception(PyObject **exceptionObject,
7112
                       const char *encoding,
7113
                       PyObject *unicode,
7114
                       Py_ssize_t startpos, Py_ssize_t endpos,
7115
                       const char *reason)
7116
573k
{
7117
573k
    make_encode_exception(exceptionObject,
7118
573k
                          encoding, unicode, startpos, endpos, reason);
7119
573k
    if (*exceptionObject != NULL)
7120
573k
        PyCodec_StrictErrors(*exceptionObject);
7121
573k
}
7122
7123
/* error handling callback helper:
7124
   build arguments, call the callback and check the arguments,
7125
   put the result into newpos and return the replacement string, which
7126
   has to be freed by the caller */
7127
static PyObject *
7128
unicode_encode_call_errorhandler(const char *errors,
7129
                                 PyObject **errorHandler,
7130
                                 const char *encoding, const char *reason,
7131
                                 PyObject *unicode, PyObject **exceptionObject,
7132
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7133
                                 Py_ssize_t *newpos)
7134
159k
{
7135
159k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7136
159k
    Py_ssize_t len;
7137
159k
    PyObject *restuple;
7138
159k
    PyObject *resunicode;
7139
7140
159k
    if (*errorHandler == NULL) {
7141
159k
        *errorHandler = PyCodec_LookupError(errors);
7142
159k
        if (*errorHandler == NULL)
7143
0
            return NULL;
7144
159k
    }
7145
7146
159k
    len = PyUnicode_GET_LENGTH(unicode);
7147
7148
159k
    make_encode_exception(exceptionObject,
7149
159k
                          encoding, unicode, startpos, endpos, reason);
7150
159k
    if (*exceptionObject == NULL)
7151
0
        return NULL;
7152
7153
159k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7154
159k
    if (restuple == NULL)
7155
159k
        return NULL;
7156
0
    if (!PyTuple_Check(restuple)) {
7157
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7158
0
        Py_DECREF(restuple);
7159
0
        return NULL;
7160
0
    }
7161
0
    if (!PyArg_ParseTuple(restuple, argparse,
7162
0
                          &resunicode, newpos)) {
7163
0
        Py_DECREF(restuple);
7164
0
        return NULL;
7165
0
    }
7166
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7167
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7168
0
        Py_DECREF(restuple);
7169
0
        return NULL;
7170
0
    }
7171
0
    if (*newpos<0)
7172
0
        *newpos = len + *newpos;
7173
0
    if (*newpos<0 || *newpos>len) {
7174
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7175
0
        Py_DECREF(restuple);
7176
0
        return NULL;
7177
0
    }
7178
0
    Py_INCREF(resunicode);
7179
0
    Py_DECREF(restuple);
7180
0
    return resunicode;
7181
0
}
7182
7183
static PyObject *
7184
unicode_encode_ucs1(PyObject *unicode,
7185
                    const char *errors,
7186
                    const Py_UCS4 limit)
7187
590k
{
7188
    /* input state */
7189
590k
    Py_ssize_t pos=0, size;
7190
590k
    int kind;
7191
590k
    const void *data;
7192
590k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7193
590k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7194
590k
    PyObject *error_handler_obj = NULL;
7195
590k
    PyObject *exc = NULL;
7196
590k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7197
590k
    PyObject *rep = NULL;
7198
7199
590k
    size = PyUnicode_GET_LENGTH(unicode);
7200
590k
    kind = PyUnicode_KIND(unicode);
7201
590k
    data = PyUnicode_DATA(unicode);
7202
    /* allocate enough for a simple encoding without
7203
       replacements, if we need more, we'll resize */
7204
590k
    if (size == 0)
7205
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7206
7207
    /* output object */
7208
590k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7209
590k
    if (writer == NULL) {
7210
0
        return NULL;
7211
0
    }
7212
    /* pointer into the output */
7213
590k
    char *str = PyBytesWriter_GetData(writer);
7214
7215
4.10M
    while (pos < size) {
7216
4.10M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7217
7218
        /* can we encode this? */
7219
4.10M
        if (ch < limit) {
7220
            /* no overflow check, because we know that the space is enough */
7221
3.51M
            *str++ = (char)ch;
7222
3.51M
            ++pos;
7223
3.51M
        }
7224
591k
        else {
7225
591k
            Py_ssize_t newpos, i;
7226
            /* startpos for collecting unencodable chars */
7227
591k
            Py_ssize_t collstart = pos;
7228
591k
            Py_ssize_t collend = collstart + 1;
7229
            /* find all unecodable characters */
7230
7231
2.49M
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7232
1.90M
                ++collend;
7233
7234
            /* Only overallocate the buffer if it's not the last write */
7235
591k
            writer->overallocate = (collend < size);
7236
7237
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7238
591k
            if (error_handler == _Py_ERROR_UNKNOWN)
7239
590k
                error_handler = _Py_GetErrorHandler(errors);
7240
7241
591k
            switch (error_handler) {
7242
573k
            case _Py_ERROR_STRICT:
7243
573k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7244
573k
                goto onError;
7245
7246
6.11k
            case _Py_ERROR_REPLACE:
7247
6.11k
                memset(str, '?', collend - collstart);
7248
6.11k
                str += (collend - collstart);
7249
6.11k
                _Py_FALLTHROUGH;
7250
6.11k
            case _Py_ERROR_IGNORE:
7251
6.11k
                pos = collend;
7252
6.11k
                break;
7253
7254
0
            case _Py_ERROR_BACKSLASHREPLACE:
7255
                /* subtract preallocated bytes */
7256
0
                writer->size -= (collend - collstart);
7257
0
                str = backslashreplace(writer, str,
7258
0
                                       unicode, collstart, collend);
7259
0
                if (str == NULL)
7260
0
                    goto onError;
7261
0
                pos = collend;
7262
0
                break;
7263
7264
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7265
                /* subtract preallocated bytes */
7266
0
                writer->size -= (collend - collstart);
7267
0
                str = xmlcharrefreplace(writer, str,
7268
0
                                        unicode, collstart, collend);
7269
0
                if (str == NULL)
7270
0
                    goto onError;
7271
0
                pos = collend;
7272
0
                break;
7273
7274
11.5k
            case _Py_ERROR_SURROGATEESCAPE:
7275
11.5k
                for (i = collstart; i < collend; ++i) {
7276
11.5k
                    ch = PyUnicode_READ(kind, data, i);
7277
11.5k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7278
                        /* Not a UTF-8b surrogate */
7279
11.5k
                        break;
7280
11.5k
                    }
7281
0
                    *str++ = (char)(ch - 0xdc00);
7282
0
                    ++pos;
7283
0
                }
7284
11.5k
                if (i >= collend)
7285
0
                    break;
7286
11.5k
                collstart = pos;
7287
11.5k
                assert(collstart != collend);
7288
11.5k
                _Py_FALLTHROUGH;
7289
7290
11.5k
            default:
7291
11.5k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7292
11.5k
                                                       encoding, reason, unicode, &exc,
7293
11.5k
                                                       collstart, collend, &newpos);
7294
11.5k
                if (rep == NULL)
7295
11.5k
                    goto onError;
7296
7297
0
                if (newpos < collstart) {
7298
0
                    writer->overallocate = 1;
7299
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7300
0
                                                             collstart - newpos,
7301
0
                                                             str);
7302
0
                    if (str == NULL) {
7303
0
                        goto onError;
7304
0
                    }
7305
0
                }
7306
0
                else {
7307
                    /* subtract preallocated bytes */
7308
0
                    writer->size -= newpos - collstart;
7309
                    /* Only overallocate the buffer if it's not the last write */
7310
0
                    writer->overallocate = (newpos < size);
7311
0
                }
7312
7313
0
                char *rep_str;
7314
0
                Py_ssize_t rep_len;
7315
0
                if (PyBytes_Check(rep)) {
7316
                    /* Directly copy bytes result to output. */
7317
0
                    rep_str = PyBytes_AS_STRING(rep);
7318
0
                    rep_len = PyBytes_GET_SIZE(rep);
7319
0
                }
7320
0
                else {
7321
0
                    assert(PyUnicode_Check(rep));
7322
7323
0
                    if (limit == 256 ?
7324
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7325
0
                        !PyUnicode_IS_ASCII(rep))
7326
0
                    {
7327
                        /* Not all characters are smaller than limit */
7328
0
                        raise_encode_exception(&exc, encoding, unicode,
7329
0
                                               collstart, collend, reason);
7330
0
                        goto onError;
7331
0
                    }
7332
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7333
0
                    rep_str = PyUnicode_DATA(rep);
7334
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7335
0
                }
7336
7337
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7338
0
                if (str == NULL) {
7339
0
                    goto onError;
7340
0
                }
7341
0
                memcpy(str, rep_str, rep_len);
7342
0
                str += rep_len;
7343
7344
0
                pos = newpos;
7345
0
                Py_CLEAR(rep);
7346
591k
            }
7347
7348
            /* If overallocation was disabled, ensure that it was the last
7349
               write. Otherwise, we missed an optimization */
7350
591k
            assert(writer->overallocate || pos == size);
7351
6.11k
        }
7352
4.10M
    }
7353
7354
5.83k
    Py_XDECREF(error_handler_obj);
7355
5.83k
    Py_XDECREF(exc);
7356
5.83k
    return PyBytesWriter_FinishWithPointer(writer, str);
7357
7358
585k
  onError:
7359
585k
    Py_XDECREF(rep);
7360
585k
    PyBytesWriter_Discard(writer);
7361
585k
    Py_XDECREF(error_handler_obj);
7362
585k
    Py_XDECREF(exc);
7363
585k
    return NULL;
7364
590k
}
7365
7366
/* Encode a str object to Latin-1 using the error handler named by
   `errors`.  Returns a bytes object or NULL on failure. */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Non-Latin-1 characters are present: the generic encoder deals
           with them through the error handler. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string already is its own Latin-1 encoding,
       so the bytes object is built straight from its buffer. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7382
7383
/* Encode a str object to Latin-1, passing NULL as the error handler name. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_AsLatin1String(unicode, NULL);

    return encoded;
}
7388
7389
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7390
7391
PyObject *
7392
PyUnicode_DecodeASCII(const char *s,
7393
                      Py_ssize_t size,
7394
                      const char *errors)
7395
11.7M
{
7396
11.7M
    const char *starts = s;
7397
11.7M
    const char *e = s + size;
7398
11.7M
    PyObject *error_handler_obj = NULL;
7399
11.7M
    PyObject *exc = NULL;
7400
11.7M
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7401
7402
11.7M
    if (size == 0)
7403
0
        _Py_RETURN_UNICODE_EMPTY();
7404
7405
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7406
11.7M
    if (size == 1 && (unsigned char)s[0] < 128) {
7407
522k
        return get_latin1_char((unsigned char)s[0]);
7408
522k
    }
7409
7410
    // Shortcut for simple case
7411
11.1M
    PyObject *u = PyUnicode_New(size, 127);
7412
11.1M
    if (u == NULL) {
7413
0
        return NULL;
7414
0
    }
7415
11.1M
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7416
11.1M
    if (outpos == size) {
7417
8.86M
        return u;
7418
8.86M
    }
7419
7420
2.33M
    _PyUnicodeWriter writer;
7421
2.33M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7422
2.33M
    writer.pos = outpos;
7423
7424
2.33M
    s += outpos;
7425
2.33M
    int kind = writer.kind;
7426
2.33M
    void *data = writer.data;
7427
2.33M
    Py_ssize_t startinpos, endinpos;
7428
7429
20.9M
    while (s < e) {
7430
20.8M
        unsigned char c = (unsigned char)*s;
7431
20.8M
        if (c < 128) {
7432
5.85M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7433
5.85M
            writer.pos++;
7434
5.85M
            ++s;
7435
5.85M
            continue;
7436
5.85M
        }
7437
7438
        /* byte outsize range 0x00..0x7f: call the error handler */
7439
7440
14.9M
        if (error_handler == _Py_ERROR_UNKNOWN)
7441
2.33M
            error_handler = _Py_GetErrorHandler(errors);
7442
7443
14.9M
        switch (error_handler)
7444
14.9M
        {
7445
898k
        case _Py_ERROR_REPLACE:
7446
12.7M
        case _Py_ERROR_SURROGATEESCAPE:
7447
            /* Fast-path: the error handler only writes one character,
7448
               but we may switch to UCS2 at the first write */
7449
12.7M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7450
0
                goto onError;
7451
12.7M
            kind = writer.kind;
7452
12.7M
            data = writer.data;
7453
7454
12.7M
            if (error_handler == _Py_ERROR_REPLACE)
7455
898k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7456
11.8M
            else
7457
11.8M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7458
12.7M
            writer.pos++;
7459
12.7M
            ++s;
7460
12.7M
            break;
7461
7462
0
        case _Py_ERROR_IGNORE:
7463
0
            ++s;
7464
0
            break;
7465
7466
2.18M
        default:
7467
2.18M
            startinpos = s-starts;
7468
2.18M
            endinpos = startinpos + 1;
7469
2.18M
            if (unicode_decode_call_errorhandler_writer(
7470
2.18M
                    errors, &error_handler_obj,
7471
2.18M
                    "ascii", "ordinal not in range(128)",
7472
2.18M
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7473
2.18M
                    &writer))
7474
2.18M
                goto onError;
7475
0
            kind = writer.kind;
7476
0
            data = writer.data;
7477
14.9M
        }
7478
14.9M
    }
7479
151k
    Py_XDECREF(error_handler_obj);
7480
151k
    Py_XDECREF(exc);
7481
151k
    return _PyUnicodeWriter_Finish(&writer);
7482
7483
2.18M
  onError:
7484
2.18M
    _PyUnicodeWriter_Dealloc(&writer);
7485
2.18M
    Py_XDECREF(error_handler_obj);
7486
2.18M
    Py_XDECREF(exc);
7487
2.18M
    return NULL;
7488
2.33M
}
7489
7490
/* Encode a str object to ASCII using the error handler named by `errors`.
   Returns a bytes object or NULL on failure. */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters are present: the generic encoder deals with
           them through the error handler. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string is its own encoding, so the bytes
       object is built straight from its buffer. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7504
7505
/* Encode a str object to ASCII, passing NULL as the error handler name. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    PyObject *encoded = _PyUnicode_AsASCIIString(unicode, NULL);

    return encoded;
}
7510
7511
#ifdef MS_WINDOWS
7512
7513
/* --- MBCS codecs for Windows -------------------------------------------- */
7514
7515
#if SIZEOF_INT < SIZEOF_SIZE_T
7516
#define NEED_RETRY
7517
#endif
7518
7519
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7520
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7521
   both cases also and avoids partial characters overrunning the
7522
   length limit in MultiByteToWideChar on Windows */
7523
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7524
7525
#ifndef WC_ERR_INVALID_CHARS
7526
#  define WC_ERR_INVALID_CHARS 0x0080
7527
#endif
7528
7529
static const char*
7530
code_page_name(UINT code_page, PyObject **obj)
7531
{
7532
    *obj = NULL;
7533
    if (code_page == CP_ACP)
7534
        return "mbcs";
7535
7536
    *obj = PyBytes_FromFormat("cp%u", code_page);
7537
    if (*obj == NULL)
7538
        return NULL;
7539
    return PyBytes_AS_STRING(*obj);
7540
}
7541
7542
static DWORD
7543
decode_code_page_flags(UINT code_page)
7544
{
7545
    if (code_page == CP_UTF7) {
7546
        /* The CP_UTF7 decoder only supports flags=0 */
7547
        return 0;
7548
    }
7549
    else
7550
        return MB_ERR_INVALID_CHARS;
7551
}
7552
7553
/*
7554
 * Decode a byte string from a Windows code page into unicode object in strict
7555
 * mode.
7556
 *
7557
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7558
 * OSError and returns -1 on other error.
7559
 */
7560
static int
7561
decode_code_page_strict(UINT code_page,
7562
                        wchar_t **buf,
7563
                        Py_ssize_t *bufsize,
7564
                        const char *in,
7565
                        int insize)
7566
{
7567
    DWORD flags = MB_ERR_INVALID_CHARS;
7568
    wchar_t *out;
7569
    DWORD outsize;
7570
7571
    /* First get the size of the result */
7572
    assert(insize > 0);
7573
    while ((outsize = MultiByteToWideChar(code_page, flags,
7574
                                          in, insize, NULL, 0)) <= 0)
7575
    {
7576
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7577
            goto error;
7578
        }
7579
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7580
        flags = 0;
7581
    }
7582
7583
    /* Extend a wchar_t* buffer */
7584
    Py_ssize_t n = *bufsize;   /* Get the current length */
7585
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7586
        return -1;
7587
    }
7588
    out = *buf + n;
7589
7590
    /* Do the conversion */
7591
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7592
    if (outsize <= 0)
7593
        goto error;
7594
    return insize;
7595
7596
error:
7597
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7598
        return -2;
7599
    PyErr_SetFromWindowsErr(0);
7600
    return -1;
7601
}
7602
7603
/*
7604
 * Decode a byte string from a code page into unicode object with an error
7605
 * handler.
7606
 *
7607
 * Returns consumed size if succeed, or raise an OSError or
7608
 * UnicodeDecodeError exception and returns -1 on error.
7609
 */
7610
static int
7611
decode_code_page_errors(UINT code_page,
7612
                        wchar_t **buf,
7613
                        Py_ssize_t *bufsize,
7614
                        const char *in, const int size,
7615
                        const char *errors, int final)
7616
{
7617
    const char *startin = in;
7618
    const char *endin = in + size;
7619
    DWORD flags = MB_ERR_INVALID_CHARS;
7620
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7621
       2000 English version of the message. */
7622
    const char *reason = "No mapping for the Unicode character exists "
7623
                         "in the target code page.";
7624
    /* each step cannot decode more than 1 character, but a character can be
7625
       represented as a surrogate pair */
7626
    wchar_t buffer[2], *out;
7627
    int insize;
7628
    Py_ssize_t outsize;
7629
    PyObject *errorHandler = NULL;
7630
    PyObject *exc = NULL;
7631
    PyObject *encoding_obj = NULL;
7632
    const char *encoding;
7633
    DWORD err;
7634
    int ret = -1;
7635
7636
    assert(size > 0);
7637
7638
    encoding = code_page_name(code_page, &encoding_obj);
7639
    if (encoding == NULL)
7640
        return -1;
7641
7642
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7643
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7644
           UnicodeDecodeError. */
7645
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7646
        if (exc != NULL) {
7647
            PyCodec_StrictErrors(exc);
7648
            Py_CLEAR(exc);
7649
        }
7650
        goto error;
7651
    }
7652
7653
    /* Extend a wchar_t* buffer */
7654
    Py_ssize_t n = *bufsize;   /* Get the current length */
7655
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7656
        PyErr_NoMemory();
7657
        goto error;
7658
    }
7659
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7660
        goto error;
7661
    }
7662
    out = *buf + n;
7663
7664
    /* Decode the byte string character per character */
7665
    while (in < endin)
7666
    {
7667
        /* Decode a character */
7668
        insize = 1;
7669
        do
7670
        {
7671
            outsize = MultiByteToWideChar(code_page, flags,
7672
                                          in, insize,
7673
                                          buffer, Py_ARRAY_LENGTH(buffer));
7674
            if (outsize > 0)
7675
                break;
7676
            err = GetLastError();
7677
            if (err == ERROR_INVALID_FLAGS && flags) {
7678
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7679
                flags = 0;
7680
                continue;
7681
            }
7682
            if (err != ERROR_NO_UNICODE_TRANSLATION
7683
                && err != ERROR_INSUFFICIENT_BUFFER)
7684
            {
7685
                PyErr_SetFromWindowsErr(err);
7686
                goto error;
7687
            }
7688
            insize++;
7689
        }
7690
        /* 4=maximum length of a UTF-8 sequence */
7691
        while (insize <= 4 && (in + insize) <= endin);
7692
7693
        if (outsize <= 0) {
7694
            Py_ssize_t startinpos, endinpos, outpos;
7695
7696
            /* last character in partial decode? */
7697
            if (in + insize >= endin && !final)
7698
                break;
7699
7700
            startinpos = in - startin;
7701
            endinpos = startinpos + 1;
7702
            outpos = out - *buf;
7703
            if (unicode_decode_call_errorhandler_wchar(
7704
                    errors, &errorHandler,
7705
                    encoding, reason,
7706
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7707
                    buf, bufsize, &outpos))
7708
            {
7709
                goto error;
7710
            }
7711
            out = *buf + outpos;
7712
        }
7713
        else {
7714
            in += insize;
7715
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7716
            out += outsize;
7717
        }
7718
    }
7719
7720
    /* Shrink the buffer */
7721
    assert(out - *buf <= *bufsize);
7722
    *bufsize = out - *buf;
7723
    /* (in - startin) <= size and size is an int */
7724
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7725
7726
error:
7727
    Py_XDECREF(encoding_obj);
7728
    Py_XDECREF(errorHandler);
7729
    Py_XDECREF(exc);
7730
    return ret;
7731
}
7732
7733
static PyObject *
7734
decode_code_page_stateful(int code_page,
7735
                          const char *s, Py_ssize_t size,
7736
                          const char *errors, Py_ssize_t *consumed)
7737
{
7738
    wchar_t *buf = NULL;
7739
    Py_ssize_t bufsize = 0;
7740
    int chunk_size, final, converted, done;
7741
7742
    if (code_page < 0) {
7743
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7744
        return NULL;
7745
    }
7746
    if (size < 0) {
7747
        PyErr_BadInternalCall();
7748
        return NULL;
7749
    }
7750
7751
    if (consumed)
7752
        *consumed = 0;
7753
7754
    do
7755
    {
7756
#ifdef NEED_RETRY
7757
        if (size > DECODING_CHUNK_SIZE) {
7758
            chunk_size = DECODING_CHUNK_SIZE;
7759
            final = 0;
7760
            done = 0;
7761
        }
7762
        else
7763
#endif
7764
        {
7765
            chunk_size = (int)size;
7766
            final = (consumed == NULL);
7767
            done = 1;
7768
        }
7769
7770
        if (chunk_size == 0 && done) {
7771
            if (buf != NULL)
7772
                break;
7773
            _Py_RETURN_UNICODE_EMPTY();
7774
        }
7775
7776
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7777
                                            s, chunk_size);
7778
        if (converted == -2)
7779
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7780
                                                s, chunk_size,
7781
                                                errors, final);
7782
        assert(converted != 0 || done);
7783
7784
        if (converted < 0) {
7785
            PyMem_Free(buf);
7786
            return NULL;
7787
        }
7788
7789
        if (consumed)
7790
            *consumed += converted;
7791
7792
        s += converted;
7793
        size -= converted;
7794
    } while (!done);
7795
7796
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7797
    PyMem_Free(buf);
7798
    return v;
7799
}
7800
7801
PyObject *
7802
PyUnicode_DecodeCodePageStateful(int code_page,
7803
                                 const char *s,
7804
                                 Py_ssize_t size,
7805
                                 const char *errors,
7806
                                 Py_ssize_t *consumed)
7807
{
7808
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7809
}
7810
7811
PyObject *
7812
PyUnicode_DecodeMBCSStateful(const char *s,
7813
                             Py_ssize_t size,
7814
                             const char *errors,
7815
                             Py_ssize_t *consumed)
7816
{
7817
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7818
}
7819
7820
PyObject *
7821
PyUnicode_DecodeMBCS(const char *s,
7822
                     Py_ssize_t size,
7823
                     const char *errors)
7824
{
7825
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7826
}
7827
7828
static DWORD
7829
encode_code_page_flags(UINT code_page, const char *errors)
7830
{
7831
    if (code_page == CP_UTF8) {
7832
        return WC_ERR_INVALID_CHARS;
7833
    }
7834
    else if (code_page == CP_UTF7) {
7835
        /* CP_UTF7 only supports flags=0 */
7836
        return 0;
7837
    }
7838
    else {
7839
        if (errors != NULL && strcmp(errors, "replace") == 0)
7840
            return 0;
7841
        else
7842
            return WC_NO_BEST_FIT_CHARS;
7843
    }
7844
}
7845
7846
/*
7847
 * Encode a Unicode string to a Windows code page into a byte string in strict
7848
 * mode.
7849
 *
7850
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7851
 * an OSError and returns -1 on other error.
7852
 */
7853
static int
7854
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7855
                        PyObject *unicode, Py_ssize_t offset, int len,
7856
                        const char* errors)
7857
{
7858
    BOOL usedDefaultChar = FALSE;
7859
    BOOL *pusedDefaultChar = &usedDefaultChar;
7860
    int outsize;
7861
    wchar_t *p;
7862
    Py_ssize_t size;
7863
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7864
    char *out;
7865
    /* Create a substring so that we can get the UTF-16 representation
7866
       of just the slice under consideration. */
7867
    PyObject *substring;
7868
    int ret = -1;
7869
7870
    assert(len > 0);
7871
7872
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7873
        pusedDefaultChar = &usedDefaultChar;
7874
    else
7875
        pusedDefaultChar = NULL;
7876
7877
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7878
    if (substring == NULL)
7879
        return -1;
7880
    p = PyUnicode_AsWideCharString(substring, &size);
7881
    Py_CLEAR(substring);
7882
    if (p == NULL) {
7883
        return -1;
7884
    }
7885
    assert(size <= INT_MAX);
7886
7887
    /* First get the size of the result */
7888
    outsize = WideCharToMultiByte(code_page, flags,
7889
                                  p, (int)size,
7890
                                  NULL, 0,
7891
                                  NULL, pusedDefaultChar);
7892
    if (outsize <= 0)
7893
        goto error;
7894
    /* If we used a default char, then we failed! */
7895
    if (pusedDefaultChar && *pusedDefaultChar) {
7896
        ret = -2;
7897
        goto done;
7898
    }
7899
7900
    if (*writer == NULL) {
7901
        /* Create string object */
7902
        *writer = PyBytesWriter_Create(outsize);
7903
        if (*writer == NULL) {
7904
            goto done;
7905
        }
7906
        out = PyBytesWriter_GetData(*writer);
7907
    }
7908
    else {
7909
        /* Extend string object */
7910
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7911
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7912
            goto done;
7913
        }
7914
        out = (char*)PyBytesWriter_GetData(*writer) + n;
7915
    }
7916
7917
    /* Do the conversion */
7918
    outsize = WideCharToMultiByte(code_page, flags,
7919
                                  p, (int)size,
7920
                                  out, outsize,
7921
                                  NULL, pusedDefaultChar);
7922
    if (outsize <= 0)
7923
        goto error;
7924
    if (pusedDefaultChar && *pusedDefaultChar) {
7925
        ret = -2;
7926
        goto done;
7927
    }
7928
    ret = 0;
7929
7930
done:
7931
    PyMem_Free(p);
7932
    return ret;
7933
7934
error:
7935
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7936
        ret = -2;
7937
        goto done;
7938
    }
7939
    PyErr_SetFromWindowsErr(0);
7940
    goto done;
7941
}
7942
7943
/*
7944
 * Encode a Unicode string to a Windows code page into a byte string using an
7945
 * error handler.
7946
 *
7947
 * Returns consumed characters if succeed, or raise an OSError and returns
7948
 * -1 on other error.
7949
 */
7950
static int
7951
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
7952
                        PyObject *unicode, Py_ssize_t unicode_offset,
7953
                        Py_ssize_t insize, const char* errors)
7954
{
7955
    const DWORD flags = encode_code_page_flags(code_page, errors);
7956
    Py_ssize_t pos = unicode_offset;
7957
    Py_ssize_t endin = unicode_offset + insize;
7958
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7959
       2000 English version of the message. */
7960
    const char *reason = "invalid character";
7961
    /* 4=maximum length of a UTF-8 sequence */
7962
    char buffer[4];
7963
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7964
    Py_ssize_t outsize;
7965
    char *out;
7966
    PyObject *errorHandler = NULL;
7967
    PyObject *exc = NULL;
7968
    PyObject *encoding_obj = NULL;
7969
    const char *encoding;
7970
    Py_ssize_t newpos;
7971
    PyObject *rep;
7972
    int ret = -1;
7973
7974
    assert(insize > 0);
7975
7976
    encoding = code_page_name(code_page, &encoding_obj);
7977
    if (encoding == NULL)
7978
        return -1;
7979
7980
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7981
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7982
           then we raise a UnicodeEncodeError. */
7983
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7984
        if (exc != NULL) {
7985
            PyCodec_StrictErrors(exc);
7986
            Py_DECREF(exc);
7987
        }
7988
        Py_XDECREF(encoding_obj);
7989
        return -1;
7990
    }
7991
7992
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7993
        pusedDefaultChar = &usedDefaultChar;
7994
    else
7995
        pusedDefaultChar = NULL;
7996
7997
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7998
        PyErr_NoMemory();
7999
        goto error;
8000
    }
8001
    outsize = insize * Py_ARRAY_LENGTH(buffer);
8002
8003
    if (*writer == NULL) {
8004
        /* Create string object */
8005
        *writer = PyBytesWriter_Create(outsize);
8006
        if (*writer == NULL) {
8007
            goto error;
8008
        }
8009
        out = PyBytesWriter_GetData(*writer);
8010
    }
8011
    else {
8012
        /* Extend string object */
8013
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8014
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8015
            goto error;
8016
        }
8017
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8018
    }
8019
8020
    /* Encode the string character per character */
8021
    while (pos < endin)
8022
    {
8023
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8024
        wchar_t chars[2];
8025
        int charsize;
8026
        if (ch < 0x10000) {
8027
            chars[0] = (wchar_t)ch;
8028
            charsize = 1;
8029
        }
8030
        else {
8031
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8032
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8033
            charsize = 2;
8034
        }
8035
8036
        outsize = WideCharToMultiByte(code_page, flags,
8037
                                      chars, charsize,
8038
                                      buffer, Py_ARRAY_LENGTH(buffer),
8039
                                      NULL, pusedDefaultChar);
8040
        if (outsize > 0) {
8041
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8042
            {
8043
                pos++;
8044
                memcpy(out, buffer, outsize);
8045
                out += outsize;
8046
                continue;
8047
            }
8048
        }
8049
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8050
            PyErr_SetFromWindowsErr(0);
8051
            goto error;
8052
        }
8053
8054
        rep = unicode_encode_call_errorhandler(
8055
                  errors, &errorHandler, encoding, reason,
8056
                  unicode, &exc,
8057
                  pos, pos + 1, &newpos);
8058
        if (rep == NULL)
8059
            goto error;
8060
8061
        Py_ssize_t morebytes = pos - newpos;
8062
        if (PyBytes_Check(rep)) {
8063
            outsize = PyBytes_GET_SIZE(rep);
8064
            morebytes += outsize;
8065
            if (morebytes > 0) {
8066
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8067
                if (out == NULL) {
8068
                    Py_DECREF(rep);
8069
                    goto error;
8070
                }
8071
            }
8072
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8073
            out += outsize;
8074
        }
8075
        else {
8076
            Py_ssize_t i;
8077
            int kind;
8078
            const void *data;
8079
8080
            outsize = PyUnicode_GET_LENGTH(rep);
8081
            morebytes += outsize;
8082
            if (morebytes > 0) {
8083
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8084
                if (out == NULL) {
8085
                    Py_DECREF(rep);
8086
                    goto error;
8087
                }
8088
            }
8089
            kind = PyUnicode_KIND(rep);
8090
            data = PyUnicode_DATA(rep);
8091
            for (i=0; i < outsize; i++) {
8092
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8093
                if (ch > 127) {
8094
                    raise_encode_exception(&exc,
8095
                        encoding, unicode,
8096
                        pos, pos + 1,
8097
                        "unable to encode error handler result to ASCII");
8098
                    Py_DECREF(rep);
8099
                    goto error;
8100
                }
8101
                *out = (unsigned char)ch;
8102
                out++;
8103
            }
8104
        }
8105
        pos = newpos;
8106
        Py_DECREF(rep);
8107
    }
8108
    /* write a NUL byte */
8109
    *out = 0;
8110
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8111
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8112
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8113
        goto error;
8114
    }
8115
    ret = 0;
8116
8117
error:
8118
    Py_XDECREF(encoding_obj);
8119
    Py_XDECREF(errorHandler);
8120
    Py_XDECREF(exc);
8121
    return ret;
8122
}
8123
8124
8125
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    /* Encode a str object to bytes using a Windows code page.  The string
       is processed chunk by chunk: each chunk is first encoded in strict
       mode and, if that reports an encode error (-2), encoded again through
       the error handler. */
    PyBytesWriter *writer = NULL;
    Py_ssize_t remaining;
    Py_ssize_t offset = 0;
    int chunk_len, status, done;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    remaining = PyUnicode_GET_LENGTH(unicode);
    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    for (;;) {
#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            done = 0;
        }
        else
#endif
        {
            chunk_len = (int)remaining;
            done = 1;
        }

        status = encode_code_page_strict(code_page, &writer,
                                         unicode, offset, chunk_len,
                                         errors);
        if (status == -2) {
            status = encode_code_page_errors(code_page, &writer,
                                             unicode, offset,
                                             chunk_len, errors);
        }
        if (status < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        offset += chunk_len;
        remaining -= chunk_len;

        if (done) {
            break;
        }
    }

    return PyBytesWriter_Finish(writer);
}
8183
8184
8185
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    /* "mbcs" encoding: the CP_ACP code page with errors=NULL. */
    const char *errors = NULL;
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
}
8190
8191
#undef NEED_RETRY
8192
8193
#endif /* MS_WINDOWS */
8194
8195
/* --- Character Mapping Codec -------------------------------------------- */
8196
8197
static int
8198
charmap_decode_string(const char *s,
8199
                      Py_ssize_t size,
8200
                      PyObject *mapping,
8201
                      const char *errors,
8202
                      _PyUnicodeWriter *writer)
8203
661k
{
8204
661k
    const char *starts = s;
8205
661k
    const char *e;
8206
661k
    Py_ssize_t startinpos, endinpos;
8207
661k
    PyObject *errorHandler = NULL, *exc = NULL;
8208
661k
    Py_ssize_t maplen;
8209
661k
    int mapkind;
8210
661k
    const void *mapdata;
8211
661k
    Py_UCS4 x;
8212
661k
    unsigned char ch;
8213
8214
661k
    maplen = PyUnicode_GET_LENGTH(mapping);
8215
661k
    mapdata = PyUnicode_DATA(mapping);
8216
661k
    mapkind = PyUnicode_KIND(mapping);
8217
8218
661k
    e = s + size;
8219
8220
661k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8221
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8222
         * is disabled in encoding aliases, latin1 is preferred because
8223
         * its implementation is faster. */
8224
134
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8225
134
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8226
134
        Py_UCS4 maxchar = writer->maxchar;
8227
8228
134
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8229
1.05M
        while (s < e) {
8230
1.05M
            ch = *s;
8231
1.05M
            x = mapdata_ucs1[ch];
8232
1.05M
            if (x > maxchar) {
8233
124
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8234
0
                    goto onError;
8235
124
                maxchar = writer->maxchar;
8236
124
                outdata = (Py_UCS1 *)writer->data;
8237
124
            }
8238
1.05M
            outdata[writer->pos] = x;
8239
1.05M
            writer->pos++;
8240
1.05M
            ++s;
8241
1.05M
        }
8242
134
        return 0;
8243
134
    }
8244
8245
768k
    while (s < e) {
8246
753k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8247
753k
            int outkind = writer->kind;
8248
753k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8249
753k
            if (outkind == PyUnicode_1BYTE_KIND) {
8250
695k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8251
695k
                Py_UCS4 maxchar = writer->maxchar;
8252
21.6M
                while (s < e) {
8253
20.9M
                    ch = *s;
8254
20.9M
                    x = mapdata_ucs2[ch];
8255
20.9M
                    if (x > maxchar)
8256
73.4k
                        goto Error;
8257
20.9M
                    outdata[writer->pos] = x;
8258
20.9M
                    writer->pos++;
8259
20.9M
                    ++s;
8260
20.9M
                }
8261
622k
                break;
8262
695k
            }
8263
57.8k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8264
57.8k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8265
32.0M
                while (s < e) {
8266
32.0M
                    ch = *s;
8267
32.0M
                    x = mapdata_ucs2[ch];
8268
32.0M
                    if (x == 0xFFFE)
8269
33.3k
                        goto Error;
8270
32.0M
                    outdata[writer->pos] = x;
8271
32.0M
                    writer->pos++;
8272
32.0M
                    ++s;
8273
32.0M
                }
8274
24.4k
                break;
8275
57.8k
            }
8276
753k
        }
8277
0
        ch = *s;
8278
8279
0
        if (ch < maplen)
8280
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8281
0
        else
8282
0
            x = 0xfffe; /* invalid value */
8283
106k
Error:
8284
106k
        if (x == 0xfffe)
8285
52.9k
        {
8286
            /* undefined mapping */
8287
52.9k
            startinpos = s-starts;
8288
52.9k
            endinpos = startinpos+1;
8289
52.9k
            if (unicode_decode_call_errorhandler_writer(
8290
52.9k
                    errors, &errorHandler,
8291
52.9k
                    "charmap", "character maps to <undefined>",
8292
52.9k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8293
52.9k
                    writer)) {
8294
18
                goto onError;
8295
18
            }
8296
52.9k
            continue;
8297
52.9k
        }
8298
8299
53.8k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8300
0
            goto onError;
8301
53.8k
        ++s;
8302
53.8k
    }
8303
661k
    Py_XDECREF(errorHandler);
8304
661k
    Py_XDECREF(exc);
8305
661k
    return 0;
8306
8307
18
onError:
8308
18
    Py_XDECREF(errorHandler);
8309
18
    Py_XDECREF(exc);
8310
18
    return -1;
8311
661k
}
8312
8313
static int
8314
charmap_decode_mapping(const char *s,
8315
                       Py_ssize_t size,
8316
                       PyObject *mapping,
8317
                       const char *errors,
8318
                       _PyUnicodeWriter *writer)
8319
0
{
8320
0
    const char *starts = s;
8321
0
    const char *e;
8322
0
    Py_ssize_t startinpos, endinpos;
8323
0
    PyObject *errorHandler = NULL, *exc = NULL;
8324
0
    unsigned char ch;
8325
0
    PyObject *key, *item = NULL;
8326
8327
0
    e = s + size;
8328
8329
0
    while (s < e) {
8330
0
        ch = *s;
8331
8332
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8333
0
        key = PyLong_FromLong((long)ch);
8334
0
        if (key == NULL)
8335
0
            goto onError;
8336
8337
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8338
0
        Py_DECREF(key);
8339
0
        if (rc == 0) {
8340
            /* No mapping found means: mapping is undefined. */
8341
0
            goto Undefined;
8342
0
        }
8343
0
        if (item == NULL) {
8344
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8345
                /* No mapping found means: mapping is undefined. */
8346
0
                PyErr_Clear();
8347
0
                goto Undefined;
8348
0
            } else
8349
0
                goto onError;
8350
0
        }
8351
8352
        /* Apply mapping */
8353
0
        if (item == Py_None)
8354
0
            goto Undefined;
8355
0
        if (PyLong_Check(item)) {
8356
0
            long value = PyLong_AsLong(item);
8357
0
            if (value == 0xFFFE)
8358
0
                goto Undefined;
8359
0
            if (value < 0 || value > MAX_UNICODE) {
8360
0
                PyErr_Format(PyExc_TypeError,
8361
0
                             "character mapping must be in range(0x%lx)",
8362
0
                             (unsigned long)MAX_UNICODE + 1);
8363
0
                goto onError;
8364
0
            }
8365
8366
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8367
0
                goto onError;
8368
0
        }
8369
0
        else if (PyUnicode_Check(item)) {
8370
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8371
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8372
0
                if (value == 0xFFFE)
8373
0
                    goto Undefined;
8374
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8375
0
                    goto onError;
8376
0
            }
8377
0
            else {
8378
0
                writer->overallocate = 1;
8379
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8380
0
                    goto onError;
8381
0
            }
8382
0
        }
8383
0
        else {
8384
            /* wrong return value */
8385
0
            PyErr_SetString(PyExc_TypeError,
8386
0
                            "character mapping must return integer, None or str");
8387
0
            goto onError;
8388
0
        }
8389
0
        Py_CLEAR(item);
8390
0
        ++s;
8391
0
        continue;
8392
8393
0
Undefined:
8394
        /* undefined mapping */
8395
0
        Py_CLEAR(item);
8396
0
        startinpos = s-starts;
8397
0
        endinpos = startinpos+1;
8398
0
        if (unicode_decode_call_errorhandler_writer(
8399
0
                errors, &errorHandler,
8400
0
                "charmap", "character maps to <undefined>",
8401
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8402
0
                writer)) {
8403
0
            goto onError;
8404
0
        }
8405
0
    }
8406
0
    Py_XDECREF(errorHandler);
8407
0
    Py_XDECREF(exc);
8408
0
    return 0;
8409
8410
0
onError:
8411
0
    Py_XDECREF(item);
8412
0
    Py_XDECREF(errorHandler);
8413
0
    Py_XDECREF(exc);
8414
0
    return -1;
8415
0
}
8416
8417
PyObject *
8418
PyUnicode_DecodeCharmap(const char *s,
8419
                        Py_ssize_t size,
8420
                        PyObject *mapping,
8421
                        const char *errors)
8422
661k
{
8423
661k
    _PyUnicodeWriter writer;
8424
8425
    /* Default to Latin-1 */
8426
661k
    if (mapping == NULL)
8427
21
        return PyUnicode_DecodeLatin1(s, size, errors);
8428
8429
661k
    if (size == 0)
8430
0
        _Py_RETURN_UNICODE_EMPTY();
8431
661k
    _PyUnicodeWriter_Init(&writer);
8432
661k
    writer.min_length = size;
8433
661k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8434
0
        goto onError;
8435
8436
661k
    if (PyUnicode_CheckExact(mapping)) {
8437
661k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8438
18
            goto onError;
8439
661k
    }
8440
0
    else {
8441
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8442
0
            goto onError;
8443
0
    }
8444
661k
    return _PyUnicodeWriter_Finish(&writer);
8445
8446
18
  onError:
8447
18
    _PyUnicodeWriter_Dealloc(&writer);
8448
18
    return NULL;
8449
661k
}
8450
8451
/* Charmap encoding: the lookup table */
8452
8453
/*[clinic input]
8454
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8455
[clinic start generated code]*/
8456
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8457
8458
struct encoding_map {
8459
    PyObject_HEAD
8460
    unsigned char level1[32];
8461
    int count2, count3;
8462
    unsigned char level23[1];
8463
};
8464
8465
/*[clinic input]
8466
EncodingMap.size
8467
8468
Return the size (in bytes) of this object.
8469
[clinic start generated code]*/
8470
8471
static PyObject *
8472
EncodingMap_size_impl(struct encoding_map *self)
8473
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8474
0
{
8475
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8476
0
                           128*self->count3);
8477
0
}
8478
8479
static PyMethodDef encoding_map_methods[] = {
8480
    ENCODINGMAP_SIZE_METHODDEF
8481
    {NULL, NULL}
8482
};
8483
8484
static PyTypeObject EncodingMapType = {
8485
    PyVarObject_HEAD_INIT(NULL, 0)
8486
    .tp_name = "EncodingMap",
8487
    .tp_basicsize = sizeof(struct encoding_map),
8488
    /* methods */
8489
    .tp_flags = Py_TPFLAGS_DEFAULT,
8490
    .tp_methods = encoding_map_methods,
8491
};
8492
8493
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    /* Build an encoding table from a decoding string, looking only at its
       first 256 characters: the character at index i encodes to byte i.
       The result is an EncodingMap (three-level trie), or a plain dict
       {code point: index} when the trie cannot represent the mapping. */
    unsigned char l1_table[32];
    unsigned char l2_table[512];
    int n2 = 0, n3 = 0;
    int use_dict = 0;
    int idx;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(string);
    const void *data = PyUnicode_DATA(string);
    const int length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
    memset(l1_table, 0xFF, sizeof l1_table);
    memset(l2_table, 0xFF, sizeof l2_table);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0) {
        use_dict = 1;
    }
    /* First pass: count the level-2 and level-3 blocks that are needed. */
    for (idx = 1; idx < length; idx++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, idx);
        if (ch == 0 || ch > 0xFFFF) {
            use_dict = 1;
            break;
        }
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        int hi = ch >> 11;
        int mid = ch >> 7;
        if (l1_table[hi] == 0xFF) {
            l1_table[hi] = n2++;
        }
        if (l2_table[mid] == 0xFF) {
            l2_table[mid] = n3++;
        }
    }

    if (n2 >= 0xFF || n3 >= 0xFF) {
        use_dict = 1;
    }

    if (use_dict) {
        PyObject *dict = PyDict_New();
        if (dict == NULL) {
            return NULL;
        }
        for (idx = 0; idx < length; idx++) {
            Py_UCS4 c = PyUnicode_READ(kind, data, idx);
            PyObject *value = NULL;
            int rc = -1;

            PyObject *key = PyLong_FromLong(c);
            if (key != NULL) {
                value = PyLong_FromLong(idx);
            }
            if (value != NULL) {
                rc = PyDict_SetItem(dict, key, value);
            }
            Py_XDECREF(key);
            Py_XDECREF(value);
            if (rc < 0) {
                Py_DECREF(dict);
                return NULL;
            }
        }
        return dict;
    }

    /* Create a three-level trie */
    PyObject *trie = PyObject_Malloc(sizeof(struct encoding_map) +
                                     16*n2 + 128*n3 - 1);
    if (trie == NULL) {
        return PyErr_NoMemory();
    }

    _PyObject_Init(trie, &EncodingMapType);
    struct encoding_map *map = (struct encoding_map*)trie;
    map->count2 = n2;
    map->count3 = n3;

    unsigned char *lvl1 = map->level1;
    unsigned char *lvl2 = map->level23;
    unsigned char *lvl3 = map->level23 + 16*n2;
    memcpy(lvl1, l1_table, 32);
    memset(lvl2, 0xFF, 16*n2);
    memset(lvl3, 0, 128*n3);

    /* Second pass: assign level-3 blocks and store the byte values. */
    n3 = 0;
    for (idx = 1; idx < length; idx++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, idx);
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        int o1 = ch >> 11;
        int o2 = (ch >> 7) & 0xF;
        int o3 = ch & 0x7F;
        int i2 = 16*lvl1[o1] + o2;
        if (lvl2[i2] == 0xFF) {
            lvl2[i2] = n3++;
        }
        int i3 = 128*lvl2[i2] + o3;
        lvl3[i3] = idx;
    }
    return trie;
}
8608
8609
static int
8610
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8611
0
{
8612
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8613
0
    int l1 = c>>11;
8614
0
    int l2 = (c>>7) & 0xF;
8615
0
    int l3 = c & 0x7F;
8616
0
    int i;
8617
8618
0
    if (c > 0xFFFF)
8619
0
        return -1;
8620
0
    if (c == 0)
8621
0
        return 0;
8622
    /* level 1*/
8623
0
    i = map->level1[l1];
8624
0
    if (i == 0xFF) {
8625
0
        return -1;
8626
0
    }
8627
    /* level 2*/
8628
0
    i = map->level23[16*i+l2];
8629
0
    if (i == 0xFF) {
8630
0
        return -1;
8631
0
    }
8632
    /* level 3 */
8633
0
    i = map->level23[16*map->count2 + 128*i + l3];
8634
0
    if (i == 0) {
8635
0
        return -1;
8636
0
    }
8637
0
    return i;
8638
0
}
8639
8640
/* Lookup the character in the mapping.
8641
   On success, return PyLong, PyBytes or None (if the character can't be found).
8642
   If the result is PyLong, put its value in replace.
8643
   On error, return NULL.
8644
   */
8645
static PyObject *
8646
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8647
0
{
8648
0
    PyObject *w = PyLong_FromLong((long)c);
8649
0
    PyObject *x;
8650
8651
0
    if (w == NULL)
8652
0
        return NULL;
8653
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8654
0
    Py_DECREF(w);
8655
0
    if (rc == 0) {
8656
        /* No mapping found means: mapping is undefined. */
8657
0
        Py_RETURN_NONE;
8658
0
    }
8659
0
    if (x == NULL) {
8660
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8661
            /* No mapping found means: mapping is undefined. */
8662
0
            PyErr_Clear();
8663
0
            Py_RETURN_NONE;
8664
0
        } else
8665
0
            return NULL;
8666
0
    }
8667
0
    else if (x == Py_None)
8668
0
        return x;
8669
0
    else if (PyLong_Check(x)) {
8670
0
        long value = PyLong_AsLong(x);
8671
0
        if (value < 0 || value > 255) {
8672
0
            PyErr_SetString(PyExc_TypeError,
8673
0
                            "character mapping must be in range(256)");
8674
0
            Py_DECREF(x);
8675
0
            return NULL;
8676
0
        }
8677
0
        *replace = (unsigned char)value;
8678
0
        return x;
8679
0
    }
8680
0
    else if (PyBytes_Check(x))
8681
0
        return x;
8682
0
    else {
8683
        /* wrong return value */
8684
0
        PyErr_Format(PyExc_TypeError,
8685
0
                     "character mapping must return integer, bytes or None, not %.400s",
8686
0
                     Py_TYPE(x)->tp_name);
8687
0
        Py_DECREF(x);
8688
0
        return NULL;
8689
0
    }
8690
0
}
8691
8692
static int
8693
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8694
0
{
8695
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8696
    /* exponentially overallocate to minimize reallocations */
8697
0
    if (requiredsize < 2 * outsize)
8698
0
        requiredsize = 2 * outsize;
8699
0
    return PyBytesWriter_Resize(writer, requiredsize);
8700
0
}
8701
8702
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8705
/* lookup the character, put the result in the output string and adjust
8706
   various state variables. Resize the output bytes object if not enough
8707
   space is available. Return a new reference to the object that
8708
   was put in the output buffer, or Py_None, if the mapping was undefined
8709
   (in which case no character was written) or NULL, if a
8710
   reallocation error occurred. The caller must decref the result */
8711
static charmapencode_result
8712
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8713
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8714
0
{
8715
0
    PyObject *rep;
8716
0
    unsigned char replace;
8717
0
    char *outstart;
8718
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8719
8720
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8721
0
        int res = encoding_map_lookup(c, mapping);
8722
0
        Py_ssize_t requiredsize = *outpos+1;
8723
0
        if (res == -1) {
8724
0
            return enc_FAILED;
8725
0
        }
8726
8727
0
        if (outsize<requiredsize) {
8728
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8729
0
                return enc_EXCEPTION;
8730
0
            }
8731
0
        }
8732
0
        outstart = _PyBytesWriter_GetData(writer);
8733
0
        outstart[(*outpos)++] = (char)res;
8734
0
        return enc_SUCCESS;
8735
0
    }
8736
8737
0
    rep = charmapencode_lookup(c, mapping, &replace);
8738
0
    if (rep==NULL)
8739
0
        return enc_EXCEPTION;
8740
0
    else if (rep==Py_None) {
8741
0
        Py_DECREF(rep);
8742
0
        return enc_FAILED;
8743
0
    } else {
8744
0
        if (PyLong_Check(rep)) {
8745
0
            Py_ssize_t requiredsize = *outpos+1;
8746
0
            if (outsize<requiredsize)
8747
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8748
0
                    Py_DECREF(rep);
8749
0
                    return enc_EXCEPTION;
8750
0
                }
8751
0
            outstart = _PyBytesWriter_GetData(writer);
8752
0
            outstart[(*outpos)++] = (char)replace;
8753
0
        }
8754
0
        else {
8755
0
            const char *repchars = PyBytes_AS_STRING(rep);
8756
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8757
0
            Py_ssize_t requiredsize = *outpos+repsize;
8758
0
            if (outsize<requiredsize)
8759
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8760
0
                    Py_DECREF(rep);
8761
0
                    return enc_EXCEPTION;
8762
0
                }
8763
0
            outstart = _PyBytesWriter_GetData(writer);
8764
0
            memcpy(outstart + *outpos, repchars, repsize);
8765
0
            *outpos += repsize;
8766
0
        }
8767
0
    }
8768
0
    Py_DECREF(rep);
8769
0
    return enc_SUCCESS;
8770
0
}
8771
8772
/* handle an error in _PyUnicode_EncodeCharmap()
8773
   Return 0 on success, -1 on error */
8774
static int
8775
charmap_encoding_error(
8776
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8777
    PyObject **exceptionObject,
8778
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8779
    PyBytesWriter *writer, Py_ssize_t *respos)
8780
0
{
8781
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8782
0
    Py_ssize_t size, repsize;
8783
0
    Py_ssize_t newpos;
8784
0
    int kind;
8785
0
    const void *data;
8786
0
    Py_ssize_t index;
8787
    /* startpos for collecting unencodable chars */
8788
0
    Py_ssize_t collstartpos = *inpos;
8789
0
    Py_ssize_t collendpos = *inpos+1;
8790
0
    Py_ssize_t collpos;
8791
0
    const char *encoding = "charmap";
8792
0
    const char *reason = "character maps to <undefined>";
8793
0
    charmapencode_result x;
8794
0
    Py_UCS4 ch;
8795
0
    int val;
8796
8797
0
    size = PyUnicode_GET_LENGTH(unicode);
8798
    /* find all unencodable characters */
8799
0
    while (collendpos < size) {
8800
0
        PyObject *rep;
8801
0
        unsigned char replace;
8802
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8803
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8804
0
            val = encoding_map_lookup(ch, mapping);
8805
0
            if (val != -1)
8806
0
                break;
8807
0
            ++collendpos;
8808
0
            continue;
8809
0
        }
8810
8811
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8812
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8813
0
        if (rep==NULL)
8814
0
            return -1;
8815
0
        else if (rep!=Py_None) {
8816
0
            Py_DECREF(rep);
8817
0
            break;
8818
0
        }
8819
0
        Py_DECREF(rep);
8820
0
        ++collendpos;
8821
0
    }
8822
    /* cache callback name lookup
8823
     * (if not done yet, i.e. it's the first error) */
8824
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8825
0
        *error_handler = _Py_GetErrorHandler(errors);
8826
8827
0
    switch (*error_handler) {
8828
0
    case _Py_ERROR_STRICT:
8829
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8830
0
        return -1;
8831
8832
0
    case _Py_ERROR_REPLACE:
8833
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8834
0
            x = charmapencode_output('?', mapping, writer, respos);
8835
0
            if (x==enc_EXCEPTION) {
8836
0
                return -1;
8837
0
            }
8838
0
            else if (x==enc_FAILED) {
8839
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8840
0
                return -1;
8841
0
            }
8842
0
        }
8843
0
        _Py_FALLTHROUGH;
8844
0
    case _Py_ERROR_IGNORE:
8845
0
        *inpos = collendpos;
8846
0
        break;
8847
8848
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8849
        /* generate replacement (temporarily (mis)uses p) */
8850
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8851
0
            char buffer[2+29+1+1];
8852
0
            char *cp;
8853
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8854
0
            for (cp = buffer; *cp; ++cp) {
8855
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8856
0
                if (x==enc_EXCEPTION)
8857
0
                    return -1;
8858
0
                else if (x==enc_FAILED) {
8859
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8860
0
                    return -1;
8861
0
                }
8862
0
            }
8863
0
        }
8864
0
        *inpos = collendpos;
8865
0
        break;
8866
8867
0
    default:
8868
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8869
0
                                                      encoding, reason, unicode, exceptionObject,
8870
0
                                                      collstartpos, collendpos, &newpos);
8871
0
        if (repunicode == NULL)
8872
0
            return -1;
8873
0
        if (PyBytes_Check(repunicode)) {
8874
            /* Directly copy bytes result to output. */
8875
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8876
0
            Py_ssize_t requiredsize;
8877
0
            repsize = PyBytes_Size(repunicode);
8878
0
            requiredsize = *respos + repsize;
8879
0
            if (requiredsize > outsize)
8880
                /* Make room for all additional bytes. */
8881
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8882
0
                    Py_DECREF(repunicode);
8883
0
                    return -1;
8884
0
                }
8885
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8886
0
                   PyBytes_AsString(repunicode),  repsize);
8887
0
            *respos += repsize;
8888
0
            *inpos = newpos;
8889
0
            Py_DECREF(repunicode);
8890
0
            break;
8891
0
        }
8892
        /* generate replacement  */
8893
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8894
0
        data = PyUnicode_DATA(repunicode);
8895
0
        kind = PyUnicode_KIND(repunicode);
8896
0
        for (index = 0; index < repsize; index++) {
8897
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8898
0
            x = charmapencode_output(repch, mapping, writer, respos);
8899
0
            if (x==enc_EXCEPTION) {
8900
0
                Py_DECREF(repunicode);
8901
0
                return -1;
8902
0
            }
8903
0
            else if (x==enc_FAILED) {
8904
0
                Py_DECREF(repunicode);
8905
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8906
0
                return -1;
8907
0
            }
8908
0
        }
8909
0
        *inpos = newpos;
8910
0
        Py_DECREF(repunicode);
8911
0
    }
8912
0
    return 0;
8913
0
}
8914
8915
/* Encode `unicode` to bytes using the character mapping `mapping`.

   With mapping == NULL the work is delegated to
   unicode_encode_ucs1(unicode, errors, 256).  An empty string encodes to
   the empty bytes constant.  Otherwise every character is looked up in
   the mapping; unencodable characters are handed to
   charmap_encoding_error(), which applies the handler named by `errors`.

   Return the result of PyBytesWriter_FinishWithSize(), or NULL on error. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    PyObject *exc = NULL;
    PyObject *error_handler_obj = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    Py_ssize_t inpos = 0;    /* current input position */
    Py_ssize_t respos = 0;   /* current output position */

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        goto onError;
    }

    if (!Py_IS_TYPE(mapping, &EncodingMapType)) {
        /* generic mapping object */
        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);

            /* try to encode it */
            switch (charmapencode_output(ch, mapping, writer, &respos)) {
            case enc_EXCEPTION:
                /* error */
                goto onError;
            case enc_FAILED:
                /* unencodable character */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj,
                                           errors,
                                           writer, &respos)) {
                    goto onError;
                }
                break;
            default:
                /* done with this character => adjust input position */
                ++inpos;
                break;
            }
        }
    }
    else {
        /* EncodingMap: the buffer pointer and size are cached locally and
           refreshed whenever the writer may have been resized */
        char *out = _PyBytesWriter_GetData(writer);
        Py_ssize_t capacity = _PyBytesWriter_GetSize(writer);

        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);

            /* try to encode it */
            int byte = encoding_map_lookup(ch, mapping);
            if (byte == -1) {
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj,
                                           errors,
                                           writer, &respos)) {
                    goto onError;
                }
                out = _PyBytesWriter_GetData(writer);
                capacity = _PyBytesWriter_GetSize(writer);
                continue;
            }

            Py_ssize_t needed = respos + 1;
            if (capacity < needed) {
                if (charmapencode_resize(writer, &respos, needed)) {
                    goto onError;
                }
                out = _PyBytesWriter_GetData(writer);
                capacity = _PyBytesWriter_GetSize(writer);
            }
            out[respos] = (char)byte;
            respos++;

            /* done with this character => adjust input position */
            ++inpos;
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Resize if we allocated too much */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9023
9024
/* Encode the str object `unicode` with `mapping`, using the default error
   handling (errors == NULL).

   Calls PyErr_BadArgument() and returns NULL if `unicode` is not a str or
   `mapping` is NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
9034
9035
/* create or adjust a UnicodeTranslateError */
9036
static void
9037
make_translate_exception(PyObject **exceptionObject,
9038
                         PyObject *unicode,
9039
                         Py_ssize_t startpos, Py_ssize_t endpos,
9040
                         const char *reason)
9041
0
{
9042
0
    if (*exceptionObject == NULL) {
9043
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9044
0
            unicode, startpos, endpos, reason);
9045
0
    }
9046
0
    else {
9047
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9048
0
            goto onError;
9049
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9050
0
            goto onError;
9051
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9052
0
            goto onError;
9053
0
        return;
9054
0
      onError:
9055
0
        Py_CLEAR(*exceptionObject);
9056
0
    }
9057
0
}
9058
9059
/* error handling callback helper:
9060
   build arguments, call the callback and check the arguments,
9061
   put the result into newpos and return the replacement string, which
9062
   has to be freed by the caller */
9063
static PyObject *
9064
unicode_translate_call_errorhandler(const char *errors,
9065
                                    PyObject **errorHandler,
9066
                                    const char *reason,
9067
                                    PyObject *unicode, PyObject **exceptionObject,
9068
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9069
                                    Py_ssize_t *newpos)
9070
0
{
9071
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9072
9073
0
    Py_ssize_t i_newpos;
9074
0
    PyObject *restuple;
9075
0
    PyObject *resunicode;
9076
9077
0
    if (*errorHandler == NULL) {
9078
0
        *errorHandler = PyCodec_LookupError(errors);
9079
0
        if (*errorHandler == NULL)
9080
0
            return NULL;
9081
0
    }
9082
9083
0
    make_translate_exception(exceptionObject,
9084
0
                             unicode, startpos, endpos, reason);
9085
0
    if (*exceptionObject == NULL)
9086
0
        return NULL;
9087
9088
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9089
0
    if (restuple == NULL)
9090
0
        return NULL;
9091
0
    if (!PyTuple_Check(restuple)) {
9092
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9093
0
        Py_DECREF(restuple);
9094
0
        return NULL;
9095
0
    }
9096
0
    if (!PyArg_ParseTuple(restuple, argparse,
9097
0
                          &resunicode, &i_newpos)) {
9098
0
        Py_DECREF(restuple);
9099
0
        return NULL;
9100
0
    }
9101
0
    if (i_newpos<0)
9102
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9103
0
    else
9104
0
        *newpos = i_newpos;
9105
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9106
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9107
0
        Py_DECREF(restuple);
9108
0
        return NULL;
9109
0
    }
9110
0
    Py_INCREF(resunicode);
9111
0
    Py_DECREF(restuple);
9112
0
    return resunicode;
9113
0
}
9114
9115
/* Lookup the character ch in the mapping and put the result in result,
9116
   which must be decrefed by the caller.
9117
   The result can be PyLong, PyUnicode, None or NULL.
9118
   If the result is PyLong, put its value in replace.
9119
   Return 0 on success, -1 on error */
9120
static int
9121
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9122
18.8k
{
9123
18.8k
    PyObject *w = PyLong_FromLong((long)c);
9124
18.8k
    PyObject *x;
9125
9126
18.8k
    if (w == NULL)
9127
0
        return -1;
9128
18.8k
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9129
18.8k
    Py_DECREF(w);
9130
18.8k
    if (rc == 0) {
9131
        /* No mapping found means: use 1:1 mapping. */
9132
6.35k
        *result = NULL;
9133
6.35k
        return 0;
9134
6.35k
    }
9135
12.4k
    if (x == NULL) {
9136
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9137
            /* No mapping found means: use 1:1 mapping. */
9138
0
            PyErr_Clear();
9139
0
            *result = NULL;
9140
0
            return 0;
9141
0
        } else
9142
0
            return -1;
9143
0
    }
9144
12.4k
    else if (x == Py_None) {
9145
0
        *result = x;
9146
0
        return 0;
9147
0
    }
9148
12.4k
    else if (PyLong_Check(x)) {
9149
0
        long value = PyLong_AsLong(x);
9150
0
        if (value < 0 || value > MAX_UNICODE) {
9151
0
            PyErr_Format(PyExc_ValueError,
9152
0
                         "character mapping must be in range(0x%lx)",
9153
0
                         (unsigned long)MAX_UNICODE + 1);
9154
0
            Py_DECREF(x);
9155
0
            return -1;
9156
0
        }
9157
0
        *result = x;
9158
0
        *replace = (Py_UCS4)value;
9159
0
        return 0;
9160
0
    }
9161
12.4k
    else if (PyUnicode_Check(x)) {
9162
12.4k
        *result = x;
9163
12.4k
        return 0;
9164
12.4k
    }
9165
0
    else {
9166
        /* wrong return value */
9167
0
        PyErr_SetString(PyExc_TypeError,
9168
0
                        "character mapping must return integer, None or str");
9169
0
        Py_DECREF(x);
9170
0
        return -1;
9171
0
    }
9172
12.4k
}
9173
9174
/* lookup the character, write the result into the writer.
9175
   Return 1 if the result was written into the writer, return 0 if the mapping
9176
   was undefined, raise an exception return -1 on error. */
9177
static int
9178
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9179
                        _PyUnicodeWriter *writer)
9180
6.39k
{
9181
6.39k
    PyObject *item;
9182
6.39k
    Py_UCS4 replace;
9183
9184
6.39k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9185
0
        return -1;
9186
9187
6.39k
    if (item == NULL) {
9188
        /* not found => default to 1:1 mapping */
9189
104
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9190
0
            return -1;
9191
0
        }
9192
104
        return 1;
9193
104
    }
9194
9195
6.29k
    if (item == Py_None) {
9196
0
        Py_DECREF(item);
9197
0
        return 0;
9198
0
    }
9199
9200
6.29k
    if (PyLong_Check(item)) {
9201
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9202
0
            Py_DECREF(item);
9203
0
            return -1;
9204
0
        }
9205
0
        Py_DECREF(item);
9206
0
        return 1;
9207
0
    }
9208
9209
6.29k
    if (!PyUnicode_Check(item)) {
9210
0
        Py_DECREF(item);
9211
0
        return -1;
9212
0
    }
9213
9214
6.29k
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9215
0
        Py_DECREF(item);
9216
0
        return -1;
9217
0
    }
9218
9219
6.29k
    Py_DECREF(item);
9220
6.29k
    return 1;
9221
6.29k
}
9222
9223
static int
9224
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9225
                              Py_UCS1 *translate)
9226
12.4k
{
9227
12.4k
    PyObject *item = NULL;
9228
12.4k
    Py_UCS4 replace;
9229
12.4k
    int ret = 0;
9230
9231
12.4k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9232
0
        return -1;
9233
0
    }
9234
9235
12.4k
    if (item == Py_None) {
9236
        /* deletion */
9237
0
        translate[ch] = 0xfe;
9238
0
    }
9239
12.4k
    else if (item == NULL) {
9240
        /* not found => default to 1:1 mapping */
9241
6.25k
        translate[ch] = ch;
9242
6.25k
        return 1;
9243
6.25k
    }
9244
6.18k
    else if (PyLong_Check(item)) {
9245
0
        if (replace > 127) {
9246
            /* invalid character or character outside ASCII:
9247
               skip the fast translate */
9248
0
            goto exit;
9249
0
        }
9250
0
        translate[ch] = (Py_UCS1)replace;
9251
0
    }
9252
6.18k
    else if (PyUnicode_Check(item)) {
9253
6.18k
        if (PyUnicode_GET_LENGTH(item) != 1)
9254
6.18k
            goto exit;
9255
9256
0
        replace = PyUnicode_READ_CHAR(item, 0);
9257
0
        if (replace > 127)
9258
0
            goto exit;
9259
0
        translate[ch] = (Py_UCS1)replace;
9260
0
    }
9261
0
    else {
9262
        /* not None, NULL, long or unicode */
9263
0
        goto exit;
9264
0
    }
9265
0
    ret = 1;
9266
9267
6.18k
  exit:
9268
6.18k
    Py_DECREF(item);
9269
6.18k
    return ret;
9270
0
}
9271
9272
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9273
   was translated into writer, return 0 if the input string was partially
9274
   translated into writer, raise an exception and return -1 on error. */
9275
static int
9276
unicode_fast_translate(PyObject *input, PyObject *mapping,
9277
                       _PyUnicodeWriter *writer, int ignore,
9278
                       Py_ssize_t *input_pos)
9279
12.3k
{
9280
12.3k
    Py_UCS1 ascii_table[128], ch, ch2;
9281
12.3k
    Py_ssize_t len;
9282
12.3k
    const Py_UCS1 *in, *end;
9283
12.3k
    Py_UCS1 *out;
9284
12.3k
    int res = 0;
9285
9286
12.3k
    len = PyUnicode_GET_LENGTH(input);
9287
9288
12.3k
    memset(ascii_table, 0xff, 128);
9289
9290
12.3k
    in = PyUnicode_1BYTE_DATA(input);
9291
12.3k
    end = in + len;
9292
9293
12.3k
    assert(PyUnicode_IS_ASCII(writer->buffer));
9294
12.3k
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9295
12.3k
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9296
9297
18.6k
    for (; in < end; in++) {
9298
12.4k
        ch = *in;
9299
12.4k
        ch2 = ascii_table[ch];
9300
12.4k
        if (ch2 == 0xff) {
9301
12.4k
            int translate = unicode_fast_translate_lookup(mapping, ch,
9302
12.4k
                                                          ascii_table);
9303
12.4k
            if (translate < 0)
9304
0
                return -1;
9305
12.4k
            if (translate == 0)
9306
6.18k
                goto exit;
9307
6.25k
            ch2 = ascii_table[ch];
9308
6.25k
        }
9309
6.29k
        if (ch2 == 0xfe) {
9310
0
            if (ignore)
9311
0
                continue;
9312
0
            goto exit;
9313
0
        }
9314
6.29k
        assert(ch2 < 128);
9315
6.29k
        *out = ch2;
9316
6.29k
        out++;
9317
6.29k
    }
9318
6.17k
    res = 1;
9319
9320
12.3k
exit:
9321
12.3k
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9322
12.3k
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9323
12.3k
    return res;
9324
6.17k
}
9325
9326
/* Translate the string `input` through `mapping`.

   ASCII input first goes through unicode_fast_translate(); whatever it
   leaves untranslated is processed character by character.  A run of
   characters that the mapping maps to None is either skipped (when
   `errors` is "ignore") or passed to the error handler named by `errors`,
   whose replacement string is written instead.

   Return the translated string, or NULL on error.  mapping == NULL is
   reported with PyErr_BadArgument(). */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* input object */
    const void *data = PyUnicode_DATA(input);
    int kind = PyUnicode_KIND(input);
    Py_ssize_t size = PyUnicode_GET_LENGTH(input);

    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* error handler */
    const char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    int ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    /* current input position */
    Py_ssize_t pos = 0;

    /* output buffer: allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    if (PyUnicode_IS_ASCII(input)) {
        int res = unicode_fast_translate(input, mapping, &writer, ignore, &pos);
        if (res < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (res == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
        /* res == 0: continue below from where the fast path stopped */
    }

    while (pos < size) {
        /* try to translate it */
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        int translate = charmaptranslate_output(ch, mapping, &writer);
        if (translate < 0) {
            goto onError;
        }
        if (translate != 0) {
            /* it worked => adjust input pointer */
            ++pos;
            continue;
        }

        /* untranslatable character: find the end of the run of
           untranslatable characters */
        Py_ssize_t collstart = pos;
        Py_ssize_t collend = pos + 1;
        while (collend < size) {
            PyObject *item;
            Py_UCS4 replace;
            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
                goto onError;
            }
            int is_none = (item == Py_None);
            Py_XDECREF(item);
            if (!is_none) {
                break;
            }
            ++collend;
        }

        if (ignore) {
            pos = collend;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *repunicode = unicode_translate_call_errorhandler(
            errors, &errorHandler, reason, input, &exc,
            collstart, collend, &newpos);
        if (repunicode == NULL) {
            goto onError;
        }
        int written = _PyUnicodeWriter_WriteStr(&writer, repunicode);
        Py_DECREF(repunicode);
        if (written < 0) {
            goto onError;
        }
        pos = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9442
9443
/* Translate `str` through `mapping` with the error handling `errors`.

   Returns NULL if ensure_unicode() rejects `str`; otherwise returns the
   result of _PyUnicode_TranslateCharmap(). */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    if (ensure_unicode(str) >= 0) {
        return _PyUnicode_TranslateCharmap(str, mapping, errors);
    }
    return NULL;
}
9452
9453
/* Return an ASCII version of `unicode` in which every whitespace
   character is replaced by ' ' and every decimal digit by the
   corresponding ASCII digit.

   An ASCII string is returned unchanged (as a new reference).  Otherwise
   characters below 127 are copied as they are.  The first character that
   is neither whitespace nor a decimal digit is stored as '?' and ends the
   conversion: the result is truncated right after it.

   Returns NULL with PyErr_BadInternalCall() if `unicode` is not a str, or
   NULL if the result cannot be allocated. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* If the string is already ASCII, just return the same string */
        return Py_NewRef(unicode);
    }

    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    const void *data = PyUnicode_DATA(unicode);
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);

    for (Py_ssize_t pos = 0; pos < len; ++pos) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (ch < 127) {
            out[pos] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[pos] = ' ';
            continue;
        }

        int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            out[pos] = '0' + decimal;
            continue;
        }

        /* not convertible: mark the position, terminate the string and
           shorten the result so that it ends here */
        out[pos] = '?';
        out[pos+1] = '\0';
        _PyUnicode_LENGTH(result) = pos + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9498
9499
/* --- Helpers ------------------------------------------------------------ */
9500
9501
/* Helper macro to fix up start/end slice values against a length.

   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if it is still negative.  `start` and `end`
   must be modifiable lvalues; all three arguments are evaluated more than
   once, so they must not have side effects.  Every argument is
   parenthesized so that expression arguments expand correctly. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9520
9521
static Py_ssize_t
9522
any_find_slice(PyObject* s1, PyObject* s2,
9523
               Py_ssize_t start,
9524
               Py_ssize_t end,
9525
               int direction)
9526
25.2M
{
9527
25.2M
    int kind1, kind2;
9528
25.2M
    const void *buf1, *buf2;
9529
25.2M
    Py_ssize_t len1, len2, result;
9530
9531
25.2M
    kind1 = PyUnicode_KIND(s1);
9532
25.2M
    kind2 = PyUnicode_KIND(s2);
9533
25.2M
    if (kind1 < kind2)
9534
0
        return -1;
9535
9536
25.2M
    len1 = PyUnicode_GET_LENGTH(s1);
9537
25.2M
    len2 = PyUnicode_GET_LENGTH(s2);
9538
25.2M
    ADJUST_INDICES(start, end, len1);
9539
25.2M
    if (end - start < len2)
9540
1.56M
        return -1;
9541
9542
23.6M
    buf1 = PyUnicode_DATA(s1);
9543
23.6M
    buf2 = PyUnicode_DATA(s2);
9544
23.6M
    if (len2 == 1) {
9545
22.8M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9546
22.8M
        result = findchar((const char *)buf1 + kind1*start,
9547
22.8M
                          kind1, end - start, ch, direction);
9548
22.8M
        if (result == -1)
9549
3.78M
            return -1;
9550
19.0M
        else
9551
19.0M
            return start + result;
9552
22.8M
    }
9553
9554
810k
    if (kind2 != kind1) {
9555
188k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9556
188k
        if (!buf2)
9557
0
            return -2;
9558
188k
    }
9559
9560
810k
    if (direction > 0) {
9561
810k
        switch (kind1) {
9562
622k
        case PyUnicode_1BYTE_KIND:
9563
622k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9564
368k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9565
254k
            else
9566
254k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9567
622k
            break;
9568
67.6k
        case PyUnicode_2BYTE_KIND:
9569
67.6k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9570
67.6k
            break;
9571
120k
        case PyUnicode_4BYTE_KIND:
9572
120k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9573
120k
            break;
9574
0
        default:
9575
0
            Py_UNREACHABLE();
9576
810k
        }
9577
810k
    }
9578
0
    else {
9579
0
        switch (kind1) {
9580
0
        case PyUnicode_1BYTE_KIND:
9581
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9582
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9583
0
            else
9584
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9585
0
            break;
9586
0
        case PyUnicode_2BYTE_KIND:
9587
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9588
0
            break;
9589
0
        case PyUnicode_4BYTE_KIND:
9590
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9591
0
            break;
9592
0
        default:
9593
0
            Py_UNREACHABLE();
9594
0
        }
9595
0
    }
9596
9597
810k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9598
810k
    if (kind2 != kind1)
9599
188k
        PyMem_Free((void *)buf2);
9600
9601
810k
    return result;
9602
810k
}
9603
9604
9605
Py_ssize_t
9606
PyUnicode_Count(PyObject *str,
9607
                PyObject *substr,
9608
                Py_ssize_t start,
9609
                Py_ssize_t end)
9610
0
{
9611
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9612
0
        return -1;
9613
9614
0
    return unicode_count_impl(str, substr, start, end);
9615
0
}
9616
9617
Py_ssize_t
9618
PyUnicode_Find(PyObject *str,
9619
               PyObject *substr,
9620
               Py_ssize_t start,
9621
               Py_ssize_t end,
9622
               int direction)
9623
0
{
9624
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9625
0
        return -2;
9626
9627
0
    return any_find_slice(str, substr, start, end, direction);
9628
0
}
9629
9630
Py_ssize_t
9631
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9632
                   Py_ssize_t start, Py_ssize_t end,
9633
                   int direction)
9634
3.80M
{
9635
3.80M
    int kind;
9636
3.80M
    Py_ssize_t len, result;
9637
3.80M
    len = PyUnicode_GET_LENGTH(str);
9638
3.80M
    ADJUST_INDICES(start, end, len);
9639
3.80M
    if (end - start < 1)
9640
0
        return -1;
9641
3.80M
    kind = PyUnicode_KIND(str);
9642
3.80M
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9643
3.80M
                      kind, end-start, ch, direction);
9644
3.80M
    if (result == -1)
9645
2.83M
        return -1;
9646
962k
    else
9647
962k
        return start + result;
9648
3.80M
}
9649
9650
static int
9651
tailmatch(PyObject *self,
9652
          PyObject *substring,
9653
          Py_ssize_t start,
9654
          Py_ssize_t end,
9655
          int direction)
9656
56.3M
{
9657
56.3M
    int kind_self;
9658
56.3M
    int kind_sub;
9659
56.3M
    const void *data_self;
9660
56.3M
    const void *data_sub;
9661
56.3M
    Py_ssize_t offset;
9662
56.3M
    Py_ssize_t i;
9663
56.3M
    Py_ssize_t end_sub;
9664
9665
56.3M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9666
56.3M
    end -= PyUnicode_GET_LENGTH(substring);
9667
56.3M
    if (end < start)
9668
8.23M
        return 0;
9669
9670
48.1M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9671
0
        return 1;
9672
9673
48.1M
    kind_self = PyUnicode_KIND(self);
9674
48.1M
    data_self = PyUnicode_DATA(self);
9675
48.1M
    kind_sub = PyUnicode_KIND(substring);
9676
48.1M
    data_sub = PyUnicode_DATA(substring);
9677
48.1M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9678
9679
48.1M
    if (direction > 0)
9680
7.15M
        offset = end;
9681
40.9M
    else
9682
40.9M
        offset = start;
9683
9684
48.1M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9685
48.1M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9686
33.0M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9687
33.0M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9688
        /* If both are of the same kind, memcmp is sufficient */
9689
13.1M
        if (kind_self == kind_sub) {
9690
6.45M
            return ! memcmp((char *)data_self +
9691
6.45M
                                (offset * PyUnicode_KIND(substring)),
9692
6.45M
                            data_sub,
9693
6.45M
                            PyUnicode_GET_LENGTH(substring) *
9694
6.45M
                                PyUnicode_KIND(substring));
9695
6.45M
        }
9696
        /* otherwise we have to compare each character by first accessing it */
9697
6.65M
        else {
9698
            /* We do not need to compare 0 and len(substring)-1 because
9699
               the if statement above ensured already that they are equal
9700
               when we end up here. */
9701
6.80M
            for (i = 1; i < end_sub; ++i) {
9702
177k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9703
177k
                    PyUnicode_READ(kind_sub, data_sub, i))
9704
26.4k
                    return 0;
9705
177k
            }
9706
6.63M
            return 1;
9707
6.65M
        }
9708
13.1M
    }
9709
9710
34.9M
    return 0;
9711
48.1M
}
9712
9713
Py_ssize_t
9714
PyUnicode_Tailmatch(PyObject *str,
9715
                    PyObject *substr,
9716
                    Py_ssize_t start,
9717
                    Py_ssize_t end,
9718
                    int direction)
9719
264
{
9720
264
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9721
0
        return -1;
9722
9723
264
    return tailmatch(str, substr, start, end, direction);
9724
264
}
9725
9726
/* Create a new string of the same length as `self` (max char 127) and fill
   it via _Py_bytes_lower() when `lower` is non-zero, or _Py_bytes_upper()
   otherwise.  Returns NULL if the new string cannot be allocated. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    PyObject *converted = PyUnicode_New(nchars, 127);
    if (converted == NULL) {
        return NULL;
    }

    const char *src = PyUnicode_DATA(self);
    char *dst = PyUnicode_DATA(converted);
    if (!lower) {
        _Py_bytes_upper(dst, src, nchars);
    }
    else {
        _Py_bytes_lower(dst, src, nchars);
    }
    return converted;
}
9744
9745
static Py_UCS4
9746
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9747
461k
{
9748
461k
    Py_ssize_t j;
9749
461k
    int final_sigma;
9750
461k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9751
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9752
9753
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9754
9755
    where ! is a negation and \p{xxx} is a character with property xxx.
9756
    */
9757
848k
    for (j = i - 1; j >= 0; j--) {
9758
845k
        c = PyUnicode_READ(kind, data, j);
9759
845k
        if (!_PyUnicode_IsCaseIgnorable(c))
9760
459k
            break;
9761
845k
    }
9762
461k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9763
461k
    if (final_sigma) {
9764
710k
        for (j = i + 1; j < length; j++) {
9765
706k
            c = PyUnicode_READ(kind, data, j);
9766
706k
            if (!_PyUnicode_IsCaseIgnorable(c))
9767
355k
                break;
9768
706k
        }
9769
359k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9770
359k
    }
9771
461k
    return (final_sigma) ? 0x3C2 : 0x3C3;
9772
461k
}
9773
9774
static int
9775
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9776
           Py_UCS4 c, Py_UCS4 *mapped)
9777
127M
{
9778
    /* Obscure special case. */
9779
127M
    if (c == 0x3A3) {
9780
461k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9781
461k
        return 1;
9782
461k
    }
9783
127M
    return _PyUnicode_ToLowerFull(c, mapped);
9784
127M
}
9785
9786
static Py_ssize_t
9787
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9788
0
{
9789
0
    Py_ssize_t i, k = 0;
9790
0
    int n_res, j;
9791
0
    Py_UCS4 c, mapped[3];
9792
9793
0
    c = PyUnicode_READ(kind, data, 0);
9794
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9795
0
    for (j = 0; j < n_res; j++) {
9796
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9797
0
        res[k++] = mapped[j];
9798
0
    }
9799
0
    for (i = 1; i < length; i++) {
9800
0
        c = PyUnicode_READ(kind, data, i);
9801
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9802
0
        for (j = 0; j < n_res; j++) {
9803
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9804
0
            res[k++] = mapped[j];
9805
0
        }
9806
0
    }
9807
0
    return k;
9808
0
}
9809
9810
static Py_ssize_t
9811
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9812
0
    Py_ssize_t i, k = 0;
9813
9814
0
    for (i = 0; i < length; i++) {
9815
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9816
0
        int n_res, j;
9817
0
        if (Py_UNICODE_ISUPPER(c)) {
9818
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9819
0
        }
9820
0
        else if (Py_UNICODE_ISLOWER(c)) {
9821
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9822
0
        }
9823
0
        else {
9824
0
            n_res = 1;
9825
0
            mapped[0] = c;
9826
0
        }
9827
0
        for (j = 0; j < n_res; j++) {
9828
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9829
0
            res[k++] = mapped[j];
9830
0
        }
9831
0
    }
9832
0
    return k;
9833
0
}
9834
9835
static Py_ssize_t
9836
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9837
                  Py_UCS4 *maxchar, int lower)
9838
6.10M
{
9839
6.10M
    Py_ssize_t i, k = 0;
9840
9841
133M
    for (i = 0; i < length; i++) {
9842
127M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9843
127M
        int n_res, j;
9844
127M
        if (lower)
9845
127M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9846
0
        else
9847
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9848
254M
        for (j = 0; j < n_res; j++) {
9849
127M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9850
127M
            res[k++] = mapped[j];
9851
127M
        }
9852
127M
    }
9853
6.10M
    return k;
9854
6.10M
}
9855
9856
static Py_ssize_t
9857
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9858
0
{
9859
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9860
0
}
9861
9862
static Py_ssize_t
9863
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9864
6.10M
{
9865
6.10M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9866
6.10M
}
9867
9868
static Py_ssize_t
9869
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9870
0
{
9871
0
    Py_ssize_t i, k = 0;
9872
9873
0
    for (i = 0; i < length; i++) {
9874
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9875
0
        Py_UCS4 mapped[3];
9876
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9877
0
        for (j = 0; j < n_res; j++) {
9878
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9879
0
            res[k++] = mapped[j];
9880
0
        }
9881
0
    }
9882
0
    return k;
9883
0
}
9884
9885
static Py_ssize_t
9886
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9887
0
{
9888
0
    Py_ssize_t i, k = 0;
9889
0
    int previous_is_cased;
9890
9891
0
    previous_is_cased = 0;
9892
0
    for (i = 0; i < length; i++) {
9893
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9894
0
        Py_UCS4 mapped[3];
9895
0
        int n_res, j;
9896
9897
0
        if (previous_is_cased)
9898
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9899
0
        else
9900
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9901
9902
0
        for (j = 0; j < n_res; j++) {
9903
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9904
0
            res[k++] = mapped[j];
9905
0
        }
9906
9907
0
        previous_is_cased = _PyUnicode_IsCased(c);
9908
0
    }
9909
0
    return k;
9910
0
}
9911
9912
static PyObject *
9913
case_operation(PyObject *self,
9914
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9915
6.10M
{
9916
6.10M
    PyObject *res = NULL;
9917
6.10M
    Py_ssize_t length, newlength = 0;
9918
6.10M
    int kind, outkind;
9919
6.10M
    const void *data;
9920
6.10M
    void *outdata;
9921
6.10M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9922
9923
6.10M
    kind = PyUnicode_KIND(self);
9924
6.10M
    data = PyUnicode_DATA(self);
9925
6.10M
    length = PyUnicode_GET_LENGTH(self);
9926
6.10M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9927
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9928
0
        return NULL;
9929
0
    }
9930
6.10M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9931
6.10M
    if (tmp == NULL)
9932
0
        return PyErr_NoMemory();
9933
6.10M
    newlength = perform(kind, data, length, tmp, &maxchar);
9934
6.10M
    res = PyUnicode_New(newlength, maxchar);
9935
6.10M
    if (res == NULL)
9936
0
        goto leave;
9937
6.10M
    tmpend = tmp + newlength;
9938
6.10M
    outdata = PyUnicode_DATA(res);
9939
6.10M
    outkind = PyUnicode_KIND(res);
9940
6.10M
    switch (outkind) {
9941
195k
    case PyUnicode_1BYTE_KIND:
9942
195k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9943
195k
        break;
9944
5.73M
    case PyUnicode_2BYTE_KIND:
9945
5.73M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9946
5.73M
        break;
9947
173k
    case PyUnicode_4BYTE_KIND:
9948
173k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9949
173k
        break;
9950
0
    default:
9951
0
        Py_UNREACHABLE();
9952
6.10M
    }
9953
6.10M
  leave:
9954
6.10M
    PyMem_Free(tmp);
9955
6.10M
    return res;
9956
6.10M
}
9957
9958
/* Join the items of `seq` with `separator`.

   `seq` is first converted with PySequence_Fast(); its item array is then
   handed to _PyUnicode_JoinArray() inside a sequence-fast critical section
   on `seq`.  Returns the joined string, or NULL if the conversion or the
   join fails. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject **elements;
    Py_ssize_t count;

    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");
    if (fast == NULL) {
        return NULL;
    }

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    elements = PySequence_Fast_ITEMS(fast);
    count = PySequence_Fast_GET_SIZE(fast);
    joined = _PyUnicode_JoinArray(separator, elements, count);

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
9982
9983
PyObject *
9984
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9985
45.8M
{
9986
45.8M
    PyObject *res = NULL; /* the result */
9987
45.8M
    PyObject *sep = NULL;
9988
45.8M
    Py_ssize_t seplen;
9989
45.8M
    PyObject *item;
9990
45.8M
    Py_ssize_t sz, i, res_offset;
9991
45.8M
    Py_UCS4 maxchar;
9992
45.8M
    Py_UCS4 item_maxchar;
9993
45.8M
    int use_memcpy;
9994
45.8M
    unsigned char *res_data = NULL, *sep_data = NULL;
9995
45.8M
    PyObject *last_obj;
9996
45.8M
    int kind = 0;
9997
9998
    /* If empty sequence, return u"". */
9999
45.8M
    if (seqlen == 0) {
10000
6.78M
        _Py_RETURN_UNICODE_EMPTY();
10001
6.78M
    }
10002
10003
    /* If singleton sequence with an exact Unicode, return that. */
10004
39.0M
    last_obj = NULL;
10005
39.0M
    if (seqlen == 1) {
10006
14.4M
        if (PyUnicode_CheckExact(items[0])) {
10007
13.2M
            res = items[0];
10008
13.2M
            return Py_NewRef(res);
10009
13.2M
        }
10010
1.22M
        seplen = 0;
10011
1.22M
        maxchar = 0;
10012
1.22M
    }
10013
24.6M
    else {
10014
        /* Set up sep and seplen */
10015
24.6M
        if (separator == NULL) {
10016
            /* fall back to a blank space separator */
10017
0
            sep = PyUnicode_FromOrdinal(' ');
10018
0
            if (!sep)
10019
0
                goto onError;
10020
0
            seplen = 1;
10021
0
            maxchar = 32;
10022
0
        }
10023
24.6M
        else {
10024
24.6M
            if (!PyUnicode_Check(separator)) {
10025
0
                PyErr_Format(PyExc_TypeError,
10026
0
                             "separator: expected str instance,"
10027
0
                             " %.80s found",
10028
0
                             Py_TYPE(separator)->tp_name);
10029
0
                goto onError;
10030
0
            }
10031
24.6M
            sep = separator;
10032
24.6M
            seplen = PyUnicode_GET_LENGTH(separator);
10033
24.6M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10034
            /* inc refcount to keep this code path symmetric with the
10035
               above case of a blank separator */
10036
24.6M
            Py_INCREF(sep);
10037
24.6M
        }
10038
24.6M
        last_obj = sep;
10039
24.6M
    }
10040
10041
    /* There are at least two things to join, or else we have a subclass
10042
     * of str in the sequence.
10043
     * Do a pre-pass to figure out the total amount of space we'll
10044
     * need (sz), and see whether all argument are strings.
10045
     */
10046
25.8M
    sz = 0;
10047
#ifdef Py_DEBUG
10048
    use_memcpy = 0;
10049
#else
10050
25.8M
    use_memcpy = 1;
10051
25.8M
#endif
10052
224M
    for (i = 0; i < seqlen; i++) {
10053
198M
        size_t add_sz;
10054
198M
        item = items[i];
10055
198M
        if (!PyUnicode_Check(item)) {
10056
0
            PyErr_Format(PyExc_TypeError,
10057
0
                         "sequence item %zd: expected str instance,"
10058
0
                         " %.80s found",
10059
0
                         i, Py_TYPE(item)->tp_name);
10060
0
            goto onError;
10061
0
        }
10062
198M
        add_sz = PyUnicode_GET_LENGTH(item);
10063
198M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10064
198M
        maxchar = Py_MAX(maxchar, item_maxchar);
10065
198M
        if (i != 0) {
10066
172M
            add_sz += seplen;
10067
172M
        }
10068
198M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10069
0
            PyErr_SetString(PyExc_OverflowError,
10070
0
                            "join() result is too long for a Python string");
10071
0
            goto onError;
10072
0
        }
10073
198M
        sz += add_sz;
10074
198M
        if (use_memcpy && last_obj != NULL) {
10075
129M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10076
2.89M
                use_memcpy = 0;
10077
129M
        }
10078
198M
        last_obj = item;
10079
198M
    }
10080
10081
25.8M
    res = PyUnicode_New(sz, maxchar);
10082
25.8M
    if (res == NULL)
10083
0
        goto onError;
10084
10085
    /* Catenate everything. */
10086
#ifdef Py_DEBUG
10087
    use_memcpy = 0;
10088
#else
10089
25.8M
    if (use_memcpy) {
10090
22.9M
        res_data = PyUnicode_1BYTE_DATA(res);
10091
22.9M
        kind = PyUnicode_KIND(res);
10092
22.9M
        if (seplen != 0)
10093
233k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10094
22.9M
    }
10095
25.8M
#endif
10096
25.8M
    if (use_memcpy) {
10097
135M
        for (i = 0; i < seqlen; ++i) {
10098
113M
            Py_ssize_t itemlen;
10099
113M
            item = items[i];
10100
10101
            /* Copy item, and maybe the separator. */
10102
113M
            if (i && seplen != 0) {
10103
801k
                memcpy(res_data,
10104
801k
                          sep_data,
10105
801k
                          kind * seplen);
10106
801k
                res_data += kind * seplen;
10107
801k
            }
10108
10109
113M
            itemlen = PyUnicode_GET_LENGTH(item);
10110
113M
            if (itemlen != 0) {
10111
101M
                memcpy(res_data,
10112
101M
                          PyUnicode_DATA(item),
10113
101M
                          kind * itemlen);
10114
101M
                res_data += kind * itemlen;
10115
101M
            }
10116
113M
        }
10117
22.9M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10118
22.9M
                           + kind * PyUnicode_GET_LENGTH(res));
10119
22.9M
    }
10120
2.89M
    else {
10121
88.5M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10122
85.7M
            Py_ssize_t itemlen;
10123
85.7M
            item = items[i];
10124
10125
            /* Copy item, and maybe the separator. */
10126
85.7M
            if (i && seplen != 0) {
10127
2.05M
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10128
2.05M
                res_offset += seplen;
10129
2.05M
            }
10130
10131
85.7M
            itemlen = PyUnicode_GET_LENGTH(item);
10132
85.7M
            if (itemlen != 0) {
10133
84.2M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10134
84.2M
                res_offset += itemlen;
10135
84.2M
            }
10136
85.7M
        }
10137
2.89M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10138
2.89M
    }
10139
10140
25.8M
    Py_XDECREF(sep);
10141
25.8M
    assert(_PyUnicode_CheckConsistency(res, 1));
10142
25.8M
    return res;
10143
10144
0
  onError:
10145
0
    Py_XDECREF(sep);
10146
0
    Py_XDECREF(res);
10147
0
    return NULL;
10148
25.8M
}
10149
10150
void
10151
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10152
                    Py_UCS4 fill_char)
10153
17.6k
{
10154
17.6k
    const int kind = PyUnicode_KIND(unicode);
10155
17.6k
    void *data = PyUnicode_DATA(unicode);
10156
17.6k
    assert(_PyUnicode_IsModifiable(unicode));
10157
17.6k
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10158
17.6k
    assert(start >= 0);
10159
17.6k
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10160
17.6k
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10161
17.6k
}
10162
10163
Py_ssize_t
10164
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10165
               Py_UCS4 fill_char)
10166
634
{
10167
634
    Py_ssize_t maxlen;
10168
10169
634
    if (!PyUnicode_Check(unicode)) {
10170
0
        PyErr_BadInternalCall();
10171
0
        return -1;
10172
0
    }
10173
634
    if (unicode_check_modifiable(unicode))
10174
0
        return -1;
10175
10176
634
    if (start < 0) {
10177
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10178
0
        return -1;
10179
0
    }
10180
634
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10181
0
        PyErr_SetString(PyExc_ValueError,
10182
0
                         "fill character is bigger than "
10183
0
                         "the string maximum character");
10184
0
        return -1;
10185
0
    }
10186
10187
634
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10188
634
    length = Py_MIN(maxlen, length);
10189
634
    if (length <= 0)
10190
0
        return 0;
10191
10192
634
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10193
634
    return length;
10194
634
}
10195
10196
static PyObject *
10197
pad(PyObject *self,
10198
    Py_ssize_t left,
10199
    Py_ssize_t right,
10200
    Py_UCS4 fill)
10201
68
{
10202
68
    PyObject *u;
10203
68
    Py_UCS4 maxchar;
10204
68
    int kind;
10205
68
    void *data;
10206
10207
68
    if (left < 0)
10208
0
        left = 0;
10209
68
    if (right < 0)
10210
0
        right = 0;
10211
10212
68
    if (left == 0 && right == 0)
10213
0
        return unicode_result_unchanged(self);
10214
10215
68
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10216
68
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10217
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10218
0
        return NULL;
10219
0
    }
10220
68
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10221
68
    maxchar = Py_MAX(maxchar, fill);
10222
68
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10223
68
    if (!u)
10224
0
        return NULL;
10225
10226
68
    kind = PyUnicode_KIND(u);
10227
68
    data = PyUnicode_DATA(u);
10228
68
    if (left)
10229
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10230
68
    if (right)
10231
68
        _PyUnicode_Fill(kind, data, fill,
10232
68
                        left + _PyUnicode_LENGTH(self), right);
10233
68
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10234
68
    assert(_PyUnicode_CheckConsistency(u, 1));
10235
68
    return u;
10236
68
}
10237
10238
/* Split `string` into lines by dispatching to the splitlines helper that
   matches the string's kind (with a separate helper for ASCII strings).
   `keepends` is passed through unchanged.  Returns NULL if
   ensure_unicode() rejects `string`; otherwise returns whatever the
   helper returns. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    const Py_ssize_t length = PyUnicode_GET_LENGTH(string);
    PyObject *lines;

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        lines = PyUnicode_IS_ASCII(string)
            ? asciilib_splitlines(string, PyUnicode_1BYTE_DATA(string),
                                  length, keepends)
            : ucs1lib_splitlines(string, PyUnicode_1BYTE_DATA(string),
                                 length, keepends);
        break;
    case PyUnicode_2BYTE_KIND:
        lines = ucs2lib_splitlines(string, PyUnicode_2BYTE_DATA(string),
                                   length, keepends);
        break;
    case PyUnicode_4BYTE_KIND:
        lines = ucs4lib_splitlines(string, PyUnicode_4BYTE_DATA(string),
                                   length, keepends);
        break;
    default:
        Py_UNREACHABLE();
    }
    return lines;
}
10272
10273
static PyObject *
10274
split(PyObject *self,
10275
      PyObject *substring,
10276
      Py_ssize_t maxcount)
10277
20.8M
{
10278
20.8M
    int kind1, kind2;
10279
20.8M
    const void *buf1, *buf2;
10280
20.8M
    Py_ssize_t len1, len2;
10281
20.8M
    PyObject* out;
10282
20.8M
    len1 = PyUnicode_GET_LENGTH(self);
10283
20.8M
    kind1 = PyUnicode_KIND(self);
10284
10285
20.8M
    if (substring == NULL) {
10286
164k
        if (maxcount < 0) {
10287
141k
            maxcount = (len1 - 1) / 2 + 1;
10288
141k
        }
10289
164k
        switch (kind1) {
10290
103k
        case PyUnicode_1BYTE_KIND:
10291
103k
            if (PyUnicode_IS_ASCII(self))
10292
77.4k
                return asciilib_split_whitespace(
10293
77.4k
                    self,  PyUnicode_1BYTE_DATA(self),
10294
77.4k
                    len1, maxcount
10295
77.4k
                    );
10296
26.2k
            else
10297
26.2k
                return ucs1lib_split_whitespace(
10298
26.2k
                    self,  PyUnicode_1BYTE_DATA(self),
10299
26.2k
                    len1, maxcount
10300
26.2k
                    );
10301
49.5k
        case PyUnicode_2BYTE_KIND:
10302
49.5k
            return ucs2lib_split_whitespace(
10303
49.5k
                self,  PyUnicode_2BYTE_DATA(self),
10304
49.5k
                len1, maxcount
10305
49.5k
                );
10306
10.9k
        case PyUnicode_4BYTE_KIND:
10307
10.9k
            return ucs4lib_split_whitespace(
10308
10.9k
                self,  PyUnicode_4BYTE_DATA(self),
10309
10.9k
                len1, maxcount
10310
10.9k
                );
10311
0
        default:
10312
0
            Py_UNREACHABLE();
10313
164k
        }
10314
164k
    }
10315
10316
20.6M
    kind2 = PyUnicode_KIND(substring);
10317
20.6M
    len2 = PyUnicode_GET_LENGTH(substring);
10318
20.6M
    if (maxcount < 0) {
10319
        // if len2 == 0, it will raise ValueError.
10320
15.0M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10321
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10322
15.0M
        maxcount = maxcount < 0 ? len1 : maxcount;
10323
15.0M
    }
10324
20.6M
    if (kind1 < kind2 || len1 < len2) {
10325
1.28M
        out = PyList_New(1);
10326
1.28M
        if (out == NULL)
10327
0
            return NULL;
10328
1.28M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10329
1.28M
        return out;
10330
1.28M
    }
10331
19.3M
    buf1 = PyUnicode_DATA(self);
10332
19.3M
    buf2 = PyUnicode_DATA(substring);
10333
19.3M
    if (kind2 != kind1) {
10334
245k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10335
245k
        if (!buf2)
10336
0
            return NULL;
10337
245k
    }
10338
10339
19.3M
    switch (kind1) {
10340
19.1M
    case PyUnicode_1BYTE_KIND:
10341
19.1M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10342
17.9M
            out = asciilib_split(
10343
17.9M
                self,  buf1, len1, buf2, len2, maxcount);
10344
1.12M
        else
10345
1.12M
            out = ucs1lib_split(
10346
1.12M
                self,  buf1, len1, buf2, len2, maxcount);
10347
19.1M
        break;
10348
209k
    case PyUnicode_2BYTE_KIND:
10349
209k
        out = ucs2lib_split(
10350
209k
            self,  buf1, len1, buf2, len2, maxcount);
10351
209k
        break;
10352
36.2k
    case PyUnicode_4BYTE_KIND:
10353
36.2k
        out = ucs4lib_split(
10354
36.2k
            self,  buf1, len1, buf2, len2, maxcount);
10355
36.2k
        break;
10356
0
    default:
10357
0
        out = NULL;
10358
19.3M
    }
10359
19.3M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10360
19.3M
    if (kind2 != kind1)
10361
245k
        PyMem_Free((void *)buf2);
10362
19.3M
    return out;
10363
19.3M
}
10364
10365
static PyObject *
10366
rsplit(PyObject *self,
10367
       PyObject *substring,
10368
       Py_ssize_t maxcount)
10369
66
{
10370
66
    int kind1, kind2;
10371
66
    const void *buf1, *buf2;
10372
66
    Py_ssize_t len1, len2;
10373
66
    PyObject* out;
10374
10375
66
    len1 = PyUnicode_GET_LENGTH(self);
10376
66
    kind1 = PyUnicode_KIND(self);
10377
10378
66
    if (substring == NULL) {
10379
0
        if (maxcount < 0) {
10380
0
            maxcount = (len1 - 1) / 2 + 1;
10381
0
        }
10382
0
        switch (kind1) {
10383
0
        case PyUnicode_1BYTE_KIND:
10384
0
            if (PyUnicode_IS_ASCII(self))
10385
0
                return asciilib_rsplit_whitespace(
10386
0
                    self,  PyUnicode_1BYTE_DATA(self),
10387
0
                    len1, maxcount
10388
0
                    );
10389
0
            else
10390
0
                return ucs1lib_rsplit_whitespace(
10391
0
                    self,  PyUnicode_1BYTE_DATA(self),
10392
0
                    len1, maxcount
10393
0
                    );
10394
0
        case PyUnicode_2BYTE_KIND:
10395
0
            return ucs2lib_rsplit_whitespace(
10396
0
                self,  PyUnicode_2BYTE_DATA(self),
10397
0
                len1, maxcount
10398
0
                );
10399
0
        case PyUnicode_4BYTE_KIND:
10400
0
            return ucs4lib_rsplit_whitespace(
10401
0
                self,  PyUnicode_4BYTE_DATA(self),
10402
0
                len1, maxcount
10403
0
                );
10404
0
        default:
10405
0
            Py_UNREACHABLE();
10406
0
        }
10407
0
    }
10408
66
    kind2 = PyUnicode_KIND(substring);
10409
66
    len2 = PyUnicode_GET_LENGTH(substring);
10410
66
    if (maxcount < 0) {
10411
        // if len2 == 0, it will raise ValueError.
10412
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10413
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10414
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10415
0
    }
10416
66
    if (kind1 < kind2 || len1 < len2) {
10417
0
        out = PyList_New(1);
10418
0
        if (out == NULL)
10419
0
            return NULL;
10420
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10421
0
        return out;
10422
0
    }
10423
66
    buf1 = PyUnicode_DATA(self);
10424
66
    buf2 = PyUnicode_DATA(substring);
10425
66
    if (kind2 != kind1) {
10426
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10427
0
        if (!buf2)
10428
0
            return NULL;
10429
0
    }
10430
10431
66
    switch (kind1) {
10432
66
    case PyUnicode_1BYTE_KIND:
10433
66
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10434
66
            out = asciilib_rsplit(
10435
66
                self,  buf1, len1, buf2, len2, maxcount);
10436
0
        else
10437
0
            out = ucs1lib_rsplit(
10438
0
                self,  buf1, len1, buf2, len2, maxcount);
10439
66
        break;
10440
0
    case PyUnicode_2BYTE_KIND:
10441
0
        out = ucs2lib_rsplit(
10442
0
            self,  buf1, len1, buf2, len2, maxcount);
10443
0
        break;
10444
0
    case PyUnicode_4BYTE_KIND:
10445
0
        out = ucs4lib_rsplit(
10446
0
            self,  buf1, len1, buf2, len2, maxcount);
10447
0
        break;
10448
0
    default:
10449
0
        out = NULL;
10450
66
    }
10451
66
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10452
66
    if (kind2 != kind1)
10453
0
        PyMem_Free((void *)buf2);
10454
66
    return out;
10455
66
}
10456
10457
static Py_ssize_t
10458
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10459
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10460
23.2M
{
10461
23.2M
    switch (kind) {
10462
8.79M
    case PyUnicode_1BYTE_KIND:
10463
8.79M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10464
4.46M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10465
4.33M
        else
10466
4.33M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10467
6.94M
    case PyUnicode_2BYTE_KIND:
10468
6.94M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10469
7.50M
    case PyUnicode_4BYTE_KIND:
10470
7.50M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10471
23.2M
    }
10472
23.2M
    Py_UNREACHABLE();
10473
23.2M
}
10474
10475
static Py_ssize_t
10476
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10477
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10478
12.0M
{
10479
12.0M
    switch (kind) {
10480
11.2M
    case PyUnicode_1BYTE_KIND:
10481
11.2M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10482
742k
    case PyUnicode_2BYTE_KIND:
10483
742k
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10484
83.9k
    case PyUnicode_4BYTE_KIND:
10485
83.9k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10486
12.0M
    }
10487
12.0M
    Py_UNREACHABLE();
10488
12.0M
}
10489
10490
static void
10491
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10492
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10493
59.9k
{
10494
59.9k
    int kind = PyUnicode_KIND(u);
10495
59.9k
    void *data = PyUnicode_DATA(u);
10496
59.9k
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10497
59.9k
    if (kind == PyUnicode_1BYTE_KIND) {
10498
33.3k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10499
33.3k
                                      (Py_UCS1 *)data + len,
10500
33.3k
                                      u1, u2, maxcount);
10501
33.3k
    }
10502
26.6k
    else if (kind == PyUnicode_2BYTE_KIND) {
10503
22.0k
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10504
22.0k
                                      (Py_UCS2 *)data + len,
10505
22.0k
                                      u1, u2, maxcount);
10506
22.0k
    }
10507
4.61k
    else {
10508
4.61k
        assert(kind == PyUnicode_4BYTE_KIND);
10509
4.61k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10510
4.61k
                                      (Py_UCS4 *)data + len,
10511
4.61k
                                      u1, u2, maxcount);
10512
4.61k
    }
10513
59.9k
}
10514
10515
static PyObject *
10516
replace(PyObject *self, PyObject *str1,
10517
        PyObject *str2, Py_ssize_t maxcount)
10518
20.2M
{
10519
20.2M
    PyObject *u;
10520
20.2M
    const char *sbuf = PyUnicode_DATA(self);
10521
20.2M
    const void *buf1 = PyUnicode_DATA(str1);
10522
20.2M
    const void *buf2 = PyUnicode_DATA(str2);
10523
20.2M
    int srelease = 0, release1 = 0, release2 = 0;
10524
20.2M
    int skind = PyUnicode_KIND(self);
10525
20.2M
    int kind1 = PyUnicode_KIND(str1);
10526
20.2M
    int kind2 = PyUnicode_KIND(str2);
10527
20.2M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10528
20.2M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10529
20.2M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10530
20.2M
    int mayshrink;
10531
20.2M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10532
10533
20.2M
    if (slen < len1)
10534
7.69M
        goto nothing;
10535
10536
12.5M
    if (maxcount < 0)
10537
12.5M
        maxcount = PY_SSIZE_T_MAX;
10538
0
    else if (maxcount == 0)
10539
0
        goto nothing;
10540
10541
12.5M
    if (str1 == str2)
10542
0
        goto nothing;
10543
10544
12.5M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10545
12.5M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10546
12.5M
    if (maxchar < maxchar_str1)
10547
        /* substring too wide to be present */
10548
0
        goto nothing;
10549
12.5M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10550
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10551
       result string. */
10552
12.5M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10553
12.5M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10554
10555
12.5M
    if (len1 == len2) {
10556
        /* same length */
10557
480k
        if (len1 == 0)
10558
0
            goto nothing;
10559
480k
        if (len1 == 1) {
10560
            /* replace characters */
10561
473k
            Py_UCS4 u1, u2;
10562
473k
            Py_ssize_t pos;
10563
10564
473k
            u1 = PyUnicode_READ(kind1, buf1, 0);
10565
473k
            pos = findchar(sbuf, skind, slen, u1, 1);
10566
473k
            if (pos < 0)
10567
413k
                goto nothing;
10568
59.9k
            u2 = PyUnicode_READ(kind2, buf2, 0);
10569
59.9k
            u = PyUnicode_New(slen, maxchar);
10570
59.9k
            if (!u)
10571
0
                goto error;
10572
10573
59.9k
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10574
59.9k
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10575
59.9k
        }
10576
7.39k
        else {
10577
7.39k
            int rkind = skind;
10578
7.39k
            char *res;
10579
7.39k
            Py_ssize_t i;
10580
10581
7.39k
            if (kind1 < rkind) {
10582
                /* widen substring */
10583
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10584
0
                if (!buf1) goto error;
10585
0
                release1 = 1;
10586
0
            }
10587
7.39k
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10588
7.39k
            if (i < 0)
10589
7.39k
                goto nothing;
10590
0
            if (rkind > kind2) {
10591
                /* widen replacement */
10592
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10593
0
                if (!buf2) goto error;
10594
0
                release2 = 1;
10595
0
            }
10596
0
            else if (rkind < kind2) {
10597
                /* widen self and buf1 */
10598
0
                rkind = kind2;
10599
0
                if (release1) {
10600
0
                    assert(buf1 != PyUnicode_DATA(str1));
10601
0
                    PyMem_Free((void *)buf1);
10602
0
                    buf1 = PyUnicode_DATA(str1);
10603
0
                    release1 = 0;
10604
0
                }
10605
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10606
0
                if (!sbuf) goto error;
10607
0
                srelease = 1;
10608
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10609
0
                if (!buf1) goto error;
10610
0
                release1 = 1;
10611
0
            }
10612
0
            u = PyUnicode_New(slen, maxchar);
10613
0
            if (!u)
10614
0
                goto error;
10615
0
            assert(PyUnicode_KIND(u) == rkind);
10616
0
            res = PyUnicode_DATA(u);
10617
10618
0
            memcpy(res, sbuf, rkind * slen);
10619
            /* change everything in-place, starting with this one */
10620
0
            memcpy(res + rkind * i,
10621
0
                   buf2,
10622
0
                   rkind * len2);
10623
0
            i += len1;
10624
10625
0
            while ( --maxcount > 0) {
10626
0
                i = anylib_find(rkind, self,
10627
0
                                sbuf+rkind*i, slen-i,
10628
0
                                str1, buf1, len1, i);
10629
0
                if (i == -1)
10630
0
                    break;
10631
0
                memcpy(res + rkind * i,
10632
0
                       buf2,
10633
0
                       rkind * len2);
10634
0
                i += len1;
10635
0
            }
10636
0
        }
10637
480k
    }
10638
12.0M
    else {
10639
12.0M
        Py_ssize_t n, i, j, ires;
10640
12.0M
        Py_ssize_t new_size;
10641
12.0M
        int rkind = skind;
10642
12.0M
        char *res;
10643
10644
12.0M
        if (kind1 < rkind) {
10645
            /* widen substring */
10646
825k
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10647
825k
            if (!buf1) goto error;
10648
825k
            release1 = 1;
10649
825k
        }
10650
12.0M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10651
12.0M
        if (n == 0)
10652
10.6M
            goto nothing;
10653
1.40M
        if (kind2 < rkind) {
10654
            /* widen replacement */
10655
48.5k
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10656
48.5k
            if (!buf2) goto error;
10657
48.5k
            release2 = 1;
10658
48.5k
        }
10659
1.35M
        else if (kind2 > rkind) {
10660
            /* widen self and buf1 */
10661
0
            rkind = kind2;
10662
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10663
0
            if (!sbuf) goto error;
10664
0
            srelease = 1;
10665
0
            if (release1) {
10666
0
                assert(buf1 != PyUnicode_DATA(str1));
10667
0
                PyMem_Free((void *)buf1);
10668
0
                buf1 = PyUnicode_DATA(str1);
10669
0
                release1 = 0;
10670
0
            }
10671
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10672
0
            if (!buf1) goto error;
10673
0
            release1 = 1;
10674
0
        }
10675
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10676
           PyUnicode_GET_LENGTH(str1)); */
10677
1.40M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10678
0
                PyErr_SetString(PyExc_OverflowError,
10679
0
                                "replace string is too long");
10680
0
                goto error;
10681
0
        }
10682
1.40M
        new_size = slen + n * (len2 - len1);
10683
1.40M
        if (new_size == 0) {
10684
0
            u = _PyUnicode_GetEmpty();
10685
0
            goto done;
10686
0
        }
10687
1.40M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10688
0
            PyErr_SetString(PyExc_OverflowError,
10689
0
                            "replace string is too long");
10690
0
            goto error;
10691
0
        }
10692
1.40M
        u = PyUnicode_New(new_size, maxchar);
10693
1.40M
        if (!u)
10694
0
            goto error;
10695
1.40M
        assert(PyUnicode_KIND(u) == rkind);
10696
1.40M
        res = PyUnicode_DATA(u);
10697
1.40M
        ires = i = 0;
10698
1.40M
        if (len1 > 0) {
10699
24.6M
            while (n-- > 0) {
10700
                /* look for next match */
10701
23.2M
                j = anylib_find(rkind, self,
10702
23.2M
                                sbuf + rkind * i, slen-i,
10703
23.2M
                                str1, buf1, len1, i);
10704
23.2M
                if (j == -1)
10705
0
                    break;
10706
23.2M
                else if (j > i) {
10707
                    /* copy unchanged part [i:j] */
10708
4.92M
                    memcpy(res + rkind * ires,
10709
4.92M
                           sbuf + rkind * i,
10710
4.92M
                           rkind * (j-i));
10711
4.92M
                    ires += j - i;
10712
4.92M
                }
10713
                /* copy substitution string */
10714
23.2M
                if (len2 > 0) {
10715
23.2M
                    memcpy(res + rkind * ires,
10716
23.2M
                           buf2,
10717
23.2M
                           rkind * len2);
10718
23.2M
                    ires += len2;
10719
23.2M
                }
10720
23.2M
                i = j + len1;
10721
23.2M
            }
10722
1.40M
            if (i < slen)
10723
                /* copy tail [i:] */
10724
1.39M
                memcpy(res + rkind * ires,
10725
1.39M
                       sbuf + rkind * i,
10726
1.39M
                       rkind * (slen-i));
10727
1.40M
        }
10728
0
        else {
10729
            /* interleave */
10730
0
            while (n > 0) {
10731
0
                memcpy(res + rkind * ires,
10732
0
                       buf2,
10733
0
                       rkind * len2);
10734
0
                ires += len2;
10735
0
                if (--n <= 0)
10736
0
                    break;
10737
0
                memcpy(res + rkind * ires,
10738
0
                       sbuf + rkind * i,
10739
0
                       rkind);
10740
0
                ires++;
10741
0
                i++;
10742
0
            }
10743
0
            memcpy(res + rkind * ires,
10744
0
                   sbuf + rkind * i,
10745
0
                   rkind * (slen-i));
10746
0
        }
10747
1.40M
    }
10748
10749
1.46M
    if (mayshrink) {
10750
0
        unicode_adjust_maxchar(&u);
10751
0
        if (u == NULL)
10752
0
            goto error;
10753
0
    }
10754
10755
1.46M
  done:
10756
1.46M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10757
1.46M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10758
1.46M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10759
1.46M
    if (srelease)
10760
0
        PyMem_Free((void *)sbuf);
10761
1.46M
    if (release1)
10762
48.5k
        PyMem_Free((void *)buf1);
10763
1.46M
    if (release2)
10764
48.5k
        PyMem_Free((void *)buf2);
10765
1.46M
    assert(_PyUnicode_CheckConsistency(u, 1));
10766
1.46M
    return u;
10767
10768
18.7M
  nothing:
10769
    /* nothing to replace; return original string (when possible) */
10770
18.7M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10771
18.7M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10772
18.7M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10773
18.7M
    if (srelease)
10774
0
        PyMem_Free((void *)sbuf);
10775
18.7M
    if (release1)
10776
777k
        PyMem_Free((void *)buf1);
10777
18.7M
    if (release2)
10778
0
        PyMem_Free((void *)buf2);
10779
18.7M
    return unicode_result_unchanged(self);
10780
10781
0
  error:
10782
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10783
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10784
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10785
0
    if (srelease)
10786
0
        PyMem_Free((void *)sbuf);
10787
0
    if (release1)
10788
0
        PyMem_Free((void *)buf1);
10789
0
    if (release2)
10790
0
        PyMem_Free((void *)buf2);
10791
0
    return NULL;
10792
1.46M
}
10793
10794
/* --- Unicode Object Methods --------------------------------------------- */
10795
10796
/*[clinic input]
10797
@permit_long_docstring_body
10798
str.title as unicode_title
10799
10800
Return a version of the string where each word is titlecased.
10801
10802
More specifically, words start with uppercased characters and all remaining
10803
cased characters have lower case.
10804
[clinic start generated code]*/
10805
10806
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* str.title(): run the generic case-mapping driver with the
       title-casing callback and hand its result straight back. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10812
10813
/*[clinic input]
10814
@permit_long_docstring_body
10815
str.capitalize as unicode_capitalize
10816
10817
Return a capitalized version of the string.
10818
10819
More specifically, make the first character have upper case and the rest lower
10820
case.
10821
[clinic start generated code]*/
10822
10823
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    /* str.capitalize(): a non-empty string goes through the generic
       case-mapping driver; an empty one is returned unchanged. */
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
10831
10832
/*[clinic input]
10833
str.casefold as unicode_casefold
10834
10835
Return a version of the string suitable for caseless comparisons.
10836
[clinic start generated code]*/
10837
10838
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* str.casefold(): ASCII strings take the ascii_upper_or_lower()
       shortcut (second argument 1 -- presumably selects lowercasing; confirm
       against that helper).  Everything else goes through the generic
       case-mapping driver with the casefold callback. */
    PyObject *folded;
    if (PyUnicode_IS_ASCII(self)) {
        folded = ascii_upper_or_lower(self, 1);
    }
    else {
        folded = case_operation(self, do_casefold);
    }
    return folded;
}
10846
10847
10848
/* Argument converter. Accepts a single Unicode character. */
10849
10850
static int
10851
convert_uc(PyObject *obj, void *addr)
10852
130
{
10853
130
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10854
10855
130
    if (!PyUnicode_Check(obj)) {
10856
0
        PyErr_Format(PyExc_TypeError,
10857
0
                     "The fill character must be a unicode character, "
10858
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10859
0
        return 0;
10860
0
    }
10861
130
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10862
0
        PyErr_SetString(PyExc_TypeError,
10863
0
                        "The fill character must be exactly one character long");
10864
0
        return 0;
10865
0
    }
10866
130
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10867
130
    return 1;
10868
130
}
10869
10870
/*[clinic input]
10871
str.center as unicode_center
10872
10873
    width: Py_ssize_t
10874
    fillchar: Py_UCS4 = ' '
10875
    /
10876
10877
Return a centered string of length width.
10878
10879
Padding is done using the specified fill character (default is a space).
10880
[clinic start generated code]*/
10881
10882
static PyObject *
10883
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10884
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10885
0
{
10886
0
    Py_ssize_t marg, left;
10887
10888
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10889
0
        return unicode_result_unchanged(self);
10890
10891
0
    marg = width - PyUnicode_GET_LENGTH(self);
10892
0
    left = marg / 2 + (marg & width & 1);
10893
10894
0
    return pad(self, left, marg - left, fillchar);
10895
0
}
10896
10897
/* This function assumes that str1 and str2 are readied by the caller. */
10898
10899
static int
10900
unicode_compare(PyObject *str1, PyObject *str2)
10901
32.7M
{
10902
32.7M
#define COMPARE(TYPE1, TYPE2) \
10903
32.7M
    do { \
10904
30.0M
        TYPE1* p1 = (TYPE1 *)data1; \
10905
30.0M
        TYPE2* p2 = (TYPE2 *)data2; \
10906
30.0M
        TYPE1* end = p1 + len; \
10907
30.0M
        Py_UCS4 c1, c2; \
10908
30.0M
        for (; p1 != end; p1++, p2++) { \
10909
30.0M
            c1 = *p1; \
10910
30.0M
            c2 = *p2; \
10911
30.0M
            if (c1 != c2) \
10912
30.0M
                return (c1 < c2) ? -1 : 1; \
10913
30.0M
        } \
10914
30.0M
    } \
10915
30.0M
    while (0)
10916
10917
32.7M
    int kind1, kind2;
10918
32.7M
    const void *data1, *data2;
10919
32.7M
    Py_ssize_t len1, len2, len;
10920
10921
32.7M
    kind1 = PyUnicode_KIND(str1);
10922
32.7M
    kind2 = PyUnicode_KIND(str2);
10923
32.7M
    data1 = PyUnicode_DATA(str1);
10924
32.7M
    data2 = PyUnicode_DATA(str2);
10925
32.7M
    len1 = PyUnicode_GET_LENGTH(str1);
10926
32.7M
    len2 = PyUnicode_GET_LENGTH(str2);
10927
32.7M
    len = Py_MIN(len1, len2);
10928
10929
32.7M
    switch(kind1) {
10930
4.34M
    case PyUnicode_1BYTE_KIND:
10931
4.34M
    {
10932
4.34M
        switch(kind2) {
10933
443k
        case PyUnicode_1BYTE_KIND:
10934
443k
        {
10935
443k
            int cmp = memcmp(data1, data2, len);
10936
            /* normalize result of memcmp() into the range [-1; 1] */
10937
443k
            if (cmp < 0)
10938
379k
                return -1;
10939
64.6k
            if (cmp > 0)
10940
58.5k
                return 1;
10941
6.16k
            break;
10942
64.6k
        }
10943
3.40M
        case PyUnicode_2BYTE_KIND:
10944
3.40M
            COMPARE(Py_UCS1, Py_UCS2);
10945
0
            break;
10946
496k
        case PyUnicode_4BYTE_KIND:
10947
496k
            COMPARE(Py_UCS1, Py_UCS4);
10948
0
            break;
10949
0
        default:
10950
0
            Py_UNREACHABLE();
10951
4.34M
        }
10952
6.16k
        break;
10953
4.34M
    }
10954
25.7M
    case PyUnicode_2BYTE_KIND:
10955
25.7M
    {
10956
25.7M
        switch(kind2) {
10957
81.4k
        case PyUnicode_1BYTE_KIND:
10958
81.4k
            COMPARE(Py_UCS2, Py_UCS1);
10959
0
            break;
10960
25.0M
        case PyUnicode_2BYTE_KIND:
10961
25.0M
        {
10962
25.0M
            COMPARE(Py_UCS2, Py_UCS2);
10963
0
            break;
10964
25.0M
        }
10965
611k
        case PyUnicode_4BYTE_KIND:
10966
611k
            COMPARE(Py_UCS2, Py_UCS4);
10967
0
            break;
10968
0
        default:
10969
0
            Py_UNREACHABLE();
10970
25.7M
        }
10971
0
        break;
10972
25.7M
    }
10973
2.62M
    case PyUnicode_4BYTE_KIND:
10974
2.62M
    {
10975
2.62M
        switch(kind2) {
10976
8.14k
        case PyUnicode_1BYTE_KIND:
10977
8.14k
            COMPARE(Py_UCS4, Py_UCS1);
10978
0
            break;
10979
363k
        case PyUnicode_2BYTE_KIND:
10980
363k
            COMPARE(Py_UCS4, Py_UCS2);
10981
0
            break;
10982
2.25M
        case PyUnicode_4BYTE_KIND:
10983
2.25M
        {
10984
2.25M
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10985
2.25M
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10986
            /* normalize result of wmemcmp() into the range [-1; 1] */
10987
2.25M
            if (cmp < 0)
10988
1.10M
                return -1;
10989
1.14M
            if (cmp > 0)
10990
1.14M
                return 1;
10991
#else
10992
            COMPARE(Py_UCS4, Py_UCS4);
10993
#endif
10994
0
            break;
10995
1.14M
        }
10996
0
        default:
10997
0
            Py_UNREACHABLE();
10998
2.62M
        }
10999
0
        break;
11000
2.62M
    }
11001
0
    default:
11002
0
        Py_UNREACHABLE();
11003
32.7M
    }
11004
11005
6.16k
    if (len1 == len2)
11006
6.12k
        return 0;
11007
37
    if (len1 < len2)
11008
15
        return -1;
11009
22
    else
11010
22
        return 1;
11011
11012
37
#undef COMPARE
11013
37
}
11014
11015
11016
int
11017
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11018
592M
{
11019
592M
    assert(PyUnicode_Check(str1));
11020
592M
    assert(PyUnicode_Check(str2));
11021
592M
    if (str1 == str2) {
11022
81.4M
        return 1;
11023
81.4M
    }
11024
511M
    return unicode_eq(str1, str2);
11025
592M
}
11026
11027
11028
int
11029
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11030
0
{
11031
0
    if (!PyUnicode_Check(str1)) {
11032
0
        PyErr_Format(PyExc_TypeError,
11033
0
                     "first argument must be str, not %T", str1);
11034
0
        return -1;
11035
0
    }
11036
0
    if (!PyUnicode_Check(str2)) {
11037
0
        PyErr_Format(PyExc_TypeError,
11038
0
                     "second argument must be str, not %T", str2);
11039
0
        return -1;
11040
0
    }
11041
11042
0
    return _PyUnicode_Equal(str1, str2);
11043
0
}
11044
11045
11046
int
11047
PyUnicode_Compare(PyObject *left, PyObject *right)
11048
276k
{
11049
276k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11050
        /* a string is equal to itself */
11051
276k
        if (left == right)
11052
0
            return 0;
11053
11054
276k
        return unicode_compare(left, right);
11055
276k
    }
11056
0
    PyErr_Format(PyExc_TypeError,
11057
0
                 "Can't compare %.100s and %.100s",
11058
0
                 Py_TYPE(left)->tp_name,
11059
0
                 Py_TYPE(right)->tp_name);
11060
0
    return -1;
11061
276k
}
11062
11063
int
11064
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11065
12.2M
{
11066
12.2M
    Py_ssize_t i;
11067
12.2M
    int kind;
11068
12.2M
    Py_UCS4 chr;
11069
11070
12.2M
    assert(_PyUnicode_CHECK(uni));
11071
12.2M
    kind = PyUnicode_KIND(uni);
11072
12.2M
    if (kind == PyUnicode_1BYTE_KIND) {
11073
12.2M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11074
12.2M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11075
12.2M
        size_t len, len2 = strlen(str);
11076
12.2M
        int cmp;
11077
11078
12.2M
        len = Py_MIN(len1, len2);
11079
12.2M
        cmp = memcmp(data, str, len);
11080
12.2M
        if (cmp != 0) {
11081
8.05M
            if (cmp < 0)
11082
51.6k
                return -1;
11083
8.00M
            else
11084
8.00M
                return 1;
11085
8.05M
        }
11086
4.18M
        if (len1 > len2)
11087
196
            return 1; /* uni is longer */
11088
4.18M
        if (len1 < len2)
11089
785
            return -1; /* str is longer */
11090
4.18M
        return 0;
11091
4.18M
    }
11092
1.45k
    else {
11093
1.45k
        const void *data = PyUnicode_DATA(uni);
11094
        /* Compare Unicode string and source character set string */
11095
2.57k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11096
2.37k
            if (chr != (unsigned char)str[i])
11097
1.25k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11098
        /* This check keeps Python strings that end in '\0' from comparing equal
11099
         to C strings identical up to that point. */
11100
203
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11101
203
            return 1; /* uni is longer */
11102
0
        if (str[i])
11103
0
            return -1; /* str is longer */
11104
0
        return 0;
11105
0
    }
11106
12.2M
}
11107
11108
int
11109
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11110
24
{
11111
24
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11112
24
}
11113
11114
int
11115
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11116
24
{
11117
24
    assert(_PyUnicode_CHECK(unicode));
11118
24
    assert(str);
11119
11120
24
    if (PyUnicode_IS_ASCII(unicode)) {
11121
24
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11122
24
        return size == len &&
11123
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11124
24
    }
11125
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11126
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11127
0
        return size == len &&
11128
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11129
0
    }
11130
11131
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11132
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11133
0
        return 0;
11134
0
    }
11135
0
    const unsigned char *s = (const unsigned char *)str;
11136
0
    const unsigned char *ends = s + (size_t)size;
11137
0
    int kind = PyUnicode_KIND(unicode);
11138
0
    const void *data = PyUnicode_DATA(unicode);
11139
    /* Compare Unicode string and UTF-8 string */
11140
0
    for (Py_ssize_t i = 0; i < len; i++) {
11141
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11142
0
        if (ch < 0x80) {
11143
0
            if (ends == s || s[0] != ch) {
11144
0
                return 0;
11145
0
            }
11146
0
            s += 1;
11147
0
        }
11148
0
        else if (ch < 0x800) {
11149
0
            if ((ends - s) < 2 ||
11150
0
                s[0] != (0xc0 | (ch >> 6)) ||
11151
0
                s[1] != (0x80 | (ch & 0x3f)))
11152
0
            {
11153
0
                return 0;
11154
0
            }
11155
0
            s += 2;
11156
0
        }
11157
0
        else if (ch < 0x10000) {
11158
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11159
0
                (ends - s) < 3 ||
11160
0
                s[0] != (0xe0 | (ch >> 12)) ||
11161
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11162
0
                s[2] != (0x80 | (ch & 0x3f)))
11163
0
            {
11164
0
                return 0;
11165
0
            }
11166
0
            s += 3;
11167
0
        }
11168
0
        else {
11169
0
            assert(ch <= MAX_UNICODE);
11170
0
            if ((ends - s) < 4 ||
11171
0
                s[0] != (0xf0 | (ch >> 18)) ||
11172
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11173
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11174
0
                s[3] != (0x80 | (ch & 0x3f)))
11175
0
            {
11176
0
                return 0;
11177
0
            }
11178
0
            s += 4;
11179
0
        }
11180
0
    }
11181
0
    return s == ends;
11182
0
}
11183
11184
int
11185
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11186
38.9M
{
11187
38.9M
    size_t len;
11188
38.9M
    assert(_PyUnicode_CHECK(unicode));
11189
38.9M
    assert(str);
11190
#ifndef NDEBUG
11191
    for (const char *p = str; *p; p++) {
11192
        assert((unsigned char)*p < 128);
11193
    }
11194
#endif
11195
38.9M
    if (!PyUnicode_IS_ASCII(unicode))
11196
172k
        return 0;
11197
38.7M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11198
38.7M
    return strlen(str) == len &&
11199
681k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11200
38.9M
}
11201
11202
/* Rich comparison of two objects with operator `op` (Py_EQ, Py_NE, Py_LT,
   Py_LE, Py_GT or Py_GE).

   Returns NotImplemented when either operand is not a str.  When both
   operands are the same object the answer is produced without looking at
   the contents; an unrecognised `op` in that case reports a bad argument
   and returns NULL.  Equality tests go through unicode_eq(), ordering
   tests through unicode_compare(). */
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* a string is equal to itself */
        if (op == Py_EQ || op == Py_LE || op == Py_GE) {
            Py_RETURN_TRUE;
        }
        if (op == Py_NE || op == Py_LT || op == Py_GT) {
            Py_RETURN_FALSE;
        }
        PyErr_BadArgument();
        return NULL;
    }

    if (op == Py_EQ || op == Py_NE) {
        int outcome = unicode_eq(left, right);
        /* Flip the equality result for "!=". */
        outcome ^= (op == Py_NE);
        return PyBool_FromLong(outcome);
    }

    const int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11236
11237
int
11238
PyUnicode_Contains(PyObject *str, PyObject *substr)
11239
215M
{
11240
215M
    int kind1, kind2;
11241
215M
    const void *buf1, *buf2;
11242
215M
    Py_ssize_t len1, len2;
11243
215M
    int result;
11244
11245
215M
    if (!PyUnicode_Check(substr)) {
11246
0
        PyErr_Format(PyExc_TypeError,
11247
0
                     "'in <string>' requires string as left operand, not %.100s",
11248
0
                     Py_TYPE(substr)->tp_name);
11249
0
        return -1;
11250
0
    }
11251
215M
    if (ensure_unicode(str) < 0)
11252
0
        return -1;
11253
11254
215M
    kind1 = PyUnicode_KIND(str);
11255
215M
    kind2 = PyUnicode_KIND(substr);
11256
215M
    if (kind1 < kind2)
11257
15.0M
        return 0;
11258
200M
    len1 = PyUnicode_GET_LENGTH(str);
11259
200M
    len2 = PyUnicode_GET_LENGTH(substr);
11260
200M
    if (len1 < len2)
11261
1.08M
        return 0;
11262
199M
    buf1 = PyUnicode_DATA(str);
11263
199M
    buf2 = PyUnicode_DATA(substr);
11264
199M
    if (len2 == 1) {
11265
177M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11266
177M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11267
177M
        return result;
11268
177M
    }
11269
22.0M
    if (kind2 != kind1) {
11270
18.0k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11271
18.0k
        if (!buf2)
11272
0
            return -1;
11273
18.0k
    }
11274
11275
22.0M
    switch (kind1) {
11276
21.9M
    case PyUnicode_1BYTE_KIND:
11277
21.9M
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11278
21.9M
        break;
11279
13.3k
    case PyUnicode_2BYTE_KIND:
11280
13.3k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11281
13.3k
        break;
11282
4.77k
    case PyUnicode_4BYTE_KIND:
11283
4.77k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11284
4.77k
        break;
11285
0
    default:
11286
0
        Py_UNREACHABLE();
11287
22.0M
    }
11288
11289
22.0M
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11290
22.0M
    if (kind2 != kind1)
11291
18.0k
        PyMem_Free((void *)buf2);
11292
11293
22.0M
    return result;
11294
22.0M
}
11295
11296
/* Concat to string or Unicode object giving a new Unicode object. */
11297
11298
PyObject *
11299
PyUnicode_Concat(PyObject *left, PyObject *right)
11300
25.8M
{
11301
25.8M
    PyObject *result;
11302
25.8M
    Py_UCS4 maxchar, maxchar2;
11303
25.8M
    Py_ssize_t left_len, right_len, new_len;
11304
11305
25.8M
    if (ensure_unicode(left) < 0)
11306
0
        return NULL;
11307
11308
25.8M
    if (!PyUnicode_Check(right)) {
11309
0
        PyErr_Format(PyExc_TypeError,
11310
0
            "can only concatenate str (not \"%.200s\") to str",
11311
0
            Py_TYPE(right)->tp_name);
11312
0
        return NULL;
11313
0
    }
11314
11315
    /* Shortcuts */
11316
25.8M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11317
25.8M
    if (left == empty) {
11318
437k
        return PyUnicode_FromObject(right);
11319
437k
    }
11320
25.4M
    if (right == empty) {
11321
1.73M
        return PyUnicode_FromObject(left);
11322
1.73M
    }
11323
11324
23.6M
    left_len = PyUnicode_GET_LENGTH(left);
11325
23.6M
    right_len = PyUnicode_GET_LENGTH(right);
11326
23.6M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11327
0
        PyErr_SetString(PyExc_OverflowError,
11328
0
                        "strings are too large to concat");
11329
0
        return NULL;
11330
0
    }
11331
23.6M
    new_len = left_len + right_len;
11332
11333
23.6M
    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11334
23.6M
    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11335
23.6M
    maxchar = Py_MAX(maxchar, maxchar2);
11336
11337
    /* Concat the two Unicode strings */
11338
23.6M
    result = PyUnicode_New(new_len, maxchar);
11339
23.6M
    if (result == NULL)
11340
0
        return NULL;
11341
23.6M
    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11342
23.6M
    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11343
23.6M
    assert(_PyUnicode_CheckConsistency(result, 1));
11344
23.6M
    return result;
11345
23.6M
}
11346
11347
void
11348
PyUnicode_Append(PyObject **p_left, PyObject *right)
11349
5.55M
{
11350
5.55M
    PyObject *left, *res;
11351
5.55M
    Py_UCS4 maxchar, maxchar2;
11352
5.55M
    Py_ssize_t left_len, right_len, new_len;
11353
11354
5.55M
    if (p_left == NULL) {
11355
0
        if (!PyErr_Occurred())
11356
0
            PyErr_BadInternalCall();
11357
0
        return;
11358
0
    }
11359
5.55M
    left = *p_left;
11360
5.55M
    if (right == NULL || left == NULL
11361
5.55M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11362
0
        if (!PyErr_Occurred())
11363
0
            PyErr_BadInternalCall();
11364
0
        goto error;
11365
0
    }
11366
11367
    /* Shortcuts */
11368
5.55M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11369
5.55M
    if (left == empty) {
11370
409k
        Py_DECREF(left);
11371
409k
        *p_left = Py_NewRef(right);
11372
409k
        return;
11373
409k
    }
11374
5.14M
    if (right == empty) {
11375
12.7k
        return;
11376
12.7k
    }
11377
11378
5.13M
    left_len = PyUnicode_GET_LENGTH(left);
11379
5.13M
    right_len = PyUnicode_GET_LENGTH(right);
11380
5.13M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11381
0
        PyErr_SetString(PyExc_OverflowError,
11382
0
                        "strings are too large to concat");
11383
0
        goto error;
11384
0
    }
11385
5.13M
    new_len = left_len + right_len;
11386
11387
5.13M
    if (_PyUnicode_IsModifiable(left)
11388
5.13M
        && PyUnicode_CheckExact(right)
11389
5.13M
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11390
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11391
           to change the structure size, but characters are stored just after
11392
           the structure, and so it requires to move all characters which is
11393
           not so different than duplicating the string. */
11394
1.91M
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11395
1.91M
    {
11396
        /* append inplace */
11397
1.91M
        if (unicode_resize(p_left, new_len) != 0)
11398
0
            goto error;
11399
11400
        /* copy 'right' into the newly allocated area of 'left' */
11401
1.91M
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11402
1.91M
    }
11403
3.21M
    else {
11404
3.21M
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11405
3.21M
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11406
3.21M
        maxchar = Py_MAX(maxchar, maxchar2);
11407
11408
        /* Concat the two Unicode strings */
11409
3.21M
        res = PyUnicode_New(new_len, maxchar);
11410
3.21M
        if (res == NULL)
11411
0
            goto error;
11412
3.21M
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11413
3.21M
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11414
3.21M
        Py_DECREF(left);
11415
3.21M
        *p_left = res;
11416
3.21M
    }
11417
5.13M
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11418
5.13M
    return;
11419
11420
0
error:
11421
0
    Py_CLEAR(*p_left);
11422
0
}
11423
11424
void
11425
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11426
8
{
11427
8
    PyUnicode_Append(pleft, right);
11428
8
    Py_XDECREF(right);
11429
8
}
11430
11431
/*[clinic input]
11432
@permit_long_summary
11433
@text_signature "($self, sub[, start[, end]], /)"
11434
str.count as unicode_count -> Py_ssize_t
11435
11436
    self as str: self
11437
    sub as substr: unicode
11438
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11439
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11440
    /
11441
11442
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11443
11444
Optional arguments start and end are interpreted as in slice notation.
11445
[clinic start generated code]*/
11446
11447
static Py_ssize_t
11448
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11449
                   Py_ssize_t end)
11450
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11451
26.0M
{
11452
26.0M
    assert(PyUnicode_Check(str));
11453
26.0M
    assert(PyUnicode_Check(substr));
11454
11455
26.0M
    Py_ssize_t result;
11456
26.0M
    int kind1, kind2;
11457
26.0M
    const void *buf1 = NULL, *buf2 = NULL;
11458
26.0M
    Py_ssize_t len1, len2;
11459
11460
26.0M
    kind1 = PyUnicode_KIND(str);
11461
26.0M
    kind2 = PyUnicode_KIND(substr);
11462
26.0M
    if (kind1 < kind2)
11463
0
        return 0;
11464
11465
26.0M
    len1 = PyUnicode_GET_LENGTH(str);
11466
26.0M
    len2 = PyUnicode_GET_LENGTH(substr);
11467
26.0M
    ADJUST_INDICES(start, end, len1);
11468
26.0M
    if (end - start < len2)
11469
3.75M
        return 0;
11470
11471
22.2M
    buf1 = PyUnicode_DATA(str);
11472
22.2M
    buf2 = PyUnicode_DATA(substr);
11473
22.2M
    if (kind2 != kind1) {
11474
6.06M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11475
6.06M
        if (!buf2)
11476
0
            goto onError;
11477
6.06M
    }
11478
11479
    // We don't reuse `anylib_count` here because of the explicit casts.
11480
22.2M
    switch (kind1) {
11481
16.1M
    case PyUnicode_1BYTE_KIND:
11482
16.1M
        result = ucs1lib_count(
11483
16.1M
            ((const Py_UCS1*)buf1) + start, end - start,
11484
16.1M
            buf2, len2, PY_SSIZE_T_MAX
11485
16.1M
            );
11486
16.1M
        break;
11487
3.70M
    case PyUnicode_2BYTE_KIND:
11488
3.70M
        result = ucs2lib_count(
11489
3.70M
            ((const Py_UCS2*)buf1) + start, end - start,
11490
3.70M
            buf2, len2, PY_SSIZE_T_MAX
11491
3.70M
            );
11492
3.70M
        break;
11493
2.36M
    case PyUnicode_4BYTE_KIND:
11494
2.36M
        result = ucs4lib_count(
11495
2.36M
            ((const Py_UCS4*)buf1) + start, end - start,
11496
2.36M
            buf2, len2, PY_SSIZE_T_MAX
11497
2.36M
            );
11498
2.36M
        break;
11499
0
    default:
11500
0
        Py_UNREACHABLE();
11501
22.2M
    }
11502
11503
22.2M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11504
22.2M
    if (kind2 != kind1)
11505
6.06M
        PyMem_Free((void *)buf2);
11506
11507
22.2M
    return result;
11508
0
  onError:
11509
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11510
0
    if (kind2 != kind1)
11511
0
        PyMem_Free((void *)buf2);
11512
0
    return -1;
11513
22.2M
}
11514
11515
/*[clinic input]
11516
str.encode as unicode_encode
11517
11518
    encoding: str(c_default="NULL") = 'utf-8'
11519
        The encoding in which to encode the string.
11520
    errors: str(c_default="NULL") = 'strict'
11521
        The error handling scheme to use for encoding errors.
11522
        The default is 'strict' meaning that encoding errors raise a
11523
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11524
        'xmlcharrefreplace' as well as any other name registered with
11525
        codecs.register_error that can handle UnicodeEncodeErrors.
11526
11527
Encode the string using the codec registered for encoding.
11528
[clinic start generated code]*/
11529
11530
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* Thin wrapper: all the work, including handling of NULL `encoding`
       and `errors` (the clinic defaults), is left to
       PyUnicode_AsEncodedString(). */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11536
11537
/*[clinic input]
11538
str.expandtabs as unicode_expandtabs
11539
11540
    tabsize: int = 8
11541
11542
Return a copy where all tab characters are expanded using spaces.
11543
11544
If tabsize is not given, a tab size of 8 characters is assumed.
11545
[clinic start generated code]*/
11546
11547
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* Works in two passes: the first measures the expanded length (and
       detects overflow), the second fills a freshly allocated string.
       When `self` contains no tab at all it is handed to
       unicode_result_unchanged() instead.  With tabsize <= 0 tabs are
       simply dropped.  Returns NULL with OverflowError when the expanded
       length would exceed PY_SSIZE_T_MAX, or NULL when allocation fails. */
    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    PyObject *expanded;
    void *dest_data;
    Py_ssize_t pos;
    Py_ssize_t out;      /* number of characters produced so far */
    Py_ssize_t column;   /* position within the current line */
    Py_ssize_t pad;      /* spaces needed to reach the next tab stop */
    Py_UCS4 ch;
    int saw_tab = 0;

    /* First pass: determine size of output string */
    out = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src_data, pos);
        if (ch != '\t') {
            if (out > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            column++;
            out++;
            /* A line break restarts the column count. */
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        saw_tab = 1;
        if (tabsize > 0) {
            pad = tabsize - (column % tabsize); /* cannot overflow */
            if (out > PY_SSIZE_T_MAX - pad) {
                goto overflow;
            }
            column += pad;
            out += pad;
        }
    }
    if (!saw_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out, PyUnicode_MAX_CHAR_VALUE(self));
    if (!expanded) {
        return NULL;
    }
    dest_data = PyUnicode_DATA(expanded);

    out = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src_data, pos);
        if (ch != '\t') {
            column++;
            PyUnicode_WRITE(kind, dest_data, out, ch);
            out++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        if (tabsize > 0) {
            pad = tabsize - (column % tabsize);
            column += pad;
            _PyUnicode_Fill(kind, dest_data, ' ', out, pad);
            out += pad;
        }
    }
    assert (out == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11622
11623
/*[clinic input]
11624
@permit_long_summary
11625
str.find as unicode_find = str.count
11626
11627
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11628
11629
Optional arguments start and end are interpreted as in slice notation.
11630
Return -1 on failure.
11631
[clinic start generated code]*/
11632
11633
static Py_ssize_t
11634
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11635
                  Py_ssize_t end)
11636
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11637
24.7M
{
11638
24.7M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11639
24.7M
    if (result < 0) {
11640
5.33M
        return -1;
11641
5.33M
    }
11642
19.3M
    return result;
11643
24.7M
}
11644
11645
static PyObject *
11646
unicode_getitem(PyObject *self, Py_ssize_t index)
11647
62.2M
{
11648
62.2M
    const void *data;
11649
62.2M
    int kind;
11650
62.2M
    Py_UCS4 ch;
11651
11652
62.2M
    if (!PyUnicode_Check(self)) {
11653
0
        PyErr_BadArgument();
11654
0
        return NULL;
11655
0
    }
11656
62.2M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11657
14.9k
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11658
14.9k
        return NULL;
11659
14.9k
    }
11660
62.2M
    kind = PyUnicode_KIND(self);
11661
62.2M
    data = PyUnicode_DATA(self);
11662
62.2M
    ch = PyUnicode_READ(kind, data, index);
11663
62.2M
    return unicode_char(ch);
11664
62.2M
}
11665
11666
/* Believe it or not, this produces the same value for ASCII strings
11667
   as bytes_hash(). */
11668
static Py_hash_t
11669
unicode_hash(PyObject *self)
11670
118M
{
11671
118M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11672
11673
#ifdef Py_DEBUG
11674
    assert(_Py_HashSecret_Initialized);
11675
#endif
11676
118M
    Py_hash_t hash = PyUnicode_HASH(self);
11677
118M
    if (hash != -1) {
11678
70.0M
        return hash;
11679
70.0M
    }
11680
48.2M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11681
48.2M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11682
11683
48.2M
    PyUnicode_SET_HASH(self, x);
11684
48.2M
    return x;
11685
118M
}
11686
11687
/*[clinic input]
11688
@permit_long_summary
11689
str.index as unicode_index = str.count
11690
11691
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11692
11693
Optional arguments start and end are interpreted as in slice notation.
11694
Raises ValueError when the substring is not found.
11695
[clinic start generated code]*/
11696
11697
static Py_ssize_t
11698
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11699
                   Py_ssize_t end)
11700
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11701
45.0k
{
11702
45.0k
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11703
45.0k
    if (result == -1) {
11704
712
        PyErr_SetString(PyExc_ValueError, "substring not found");
11705
712
    }
11706
44.3k
    else if (result < 0) {
11707
0
        return -1;
11708
0
    }
11709
45.0k
    return result;
11710
45.0k
}
11711
11712
/*[clinic input]
11713
str.isascii as unicode_isascii
11714
11715
Return True if all characters in the string are ASCII, False otherwise.
11716
11717
ASCII characters have code points in the range U+0000-U+007F.
11718
Empty string is ASCII too.
11719
[clinic start generated code]*/
11720
11721
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* No scan: the answer comes straight from PyUnicode_IS_ASCII(). */
    const long is_ascii = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(is_ascii);
}
11727
11728
/*[clinic input]
11729
@permit_long_docstring_body
11730
str.islower as unicode_islower
11731
11732
Return True if the string is a lowercase string, False otherwise.
11733
11734
A string is lowercase if all cased characters in the string are lowercase and
11735
there is at least one cased character in the string.
11736
[clinic start generated code]*/
11737
11738
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
{
    /* True when no character is uppercase or titlecase and at least one
       character is lowercase; False for the empty string. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, chars, 0)));
    }

    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once one lowercase character is known, stop testing for more. */
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11771
11772
/*[clinic input]
11773
@permit_long_docstring_body
11774
str.isupper as unicode_isupper
11775
11776
Return True if the string is an uppercase string, False otherwise.
11777
11778
A string is uppercase if all cased characters in the string are uppercase and
11779
there is at least one cased character in the string.
11780
[clinic start generated code]*/
11781
11782
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
{
    /* True when no character is lowercase or titlecase and at least one
       character is uppercase; False for the empty string. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, chars, 0)) != 0);
    }

    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once one uppercase character is known, stop testing for more. */
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
11815
11816
/*[clinic input]
11817
str.istitle as unicode_istitle
11818
11819
Return True if the string is a title-cased string, False otherwise.
11820
11821
In a title-cased string, upper- and title-case characters may only
11822
follow uncased characters and lowercase characters only cased ones.
11823
[clinic start generated code]*/
11824
11825
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* Walks the string tracking whether the previous character was cased.
       An upper/titlecase character is only accepted right after an
       uncased one, a lowercase character only right after a cased one.
       The result is False unless at least one cased character was seen. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, chars, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    int seen_cased = 0;
    int after_cased = 0;
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* A capital in the middle of a cased run breaks title case. */
            if (after_cased) {
                Py_RETURN_FALSE;
            }
            after_cased = 1;
            seen_cased = 1;
        }
        else if (Py_UNICODE_ISLOWER(ch)) {
            /* A lowercase character must continue a cased run. */
            if (!after_cased) {
                Py_RETURN_FALSE;
            }
            after_cased = 1;
            seen_cased = 1;
        }
        else {
            after_cased = 0;
        }
    }
    return PyBool_FromLong(seen_cased);
}
11871
11872
/*[clinic input]
11873
@permit_long_docstring_body
11874
str.isspace as unicode_isspace
11875
11876
Return True if the string is a whitespace string, False otherwise.
11877
11878
A string is whitespace if all characters in the string are whitespace and there
11879
is at least one character in the string.
11880
[clinic start generated code]*/
11881
11882
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
{
    /* True when the string is non-empty and every character satisfies
       Py_UNICODE_ISSPACE(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, chars, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11910
11911
/*[clinic input]
11912
@permit_long_docstring_body
11913
str.isalpha as unicode_isalpha
11914
11915
Return True if the string is an alphabetic string, False otherwise.
11916
11917
A string is alphabetic if all characters in the string are alphabetic and there
11918
is at least one character in the string.
11919
[clinic start generated code]*/
11920
11921
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
{
    /* True when the string is non-empty and every character satisfies
       Py_UNICODE_ISALPHA(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, chars, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, chars, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11948
11949
/*[clinic input]
11950
@permit_long_docstring_body
11951
str.isalnum as unicode_isalnum
11952
11953
Return True if the string is an alpha-numeric string, False otherwise.
11954
11955
A string is alpha-numeric if all characters in the string are alpha-numeric and
11956
there is at least one character in the string.
11957
[clinic start generated code]*/
11958
11959
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
{
    /* True when the string is non-empty and every character satisfies
       Py_UNICODE_ISALNUM(). */
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, chars, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11988
11989
/*[clinic input]
11990
@permit_long_docstring_body
11991
str.isdecimal as unicode_isdecimal
11992
11993
Return True if the string is a decimal string, False otherwise.
11994
11995
A string is a decimal string if all characters in the string are decimal and
11996
there is at least one character in the string.
11997
[clinic start generated code]*/
11998
11999
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
{
    /* True when the string is non-empty and every character satisfies
       Py_UNICODE_ISDECIMAL(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, chars, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, chars, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12026
12027
/*[clinic input]
12028
@permit_long_docstring_body
12029
str.isdigit as unicode_isdigit
12030
12031
Return True if the string is a digit string, False otherwise.
12032
12033
A string is a digit string if all characters in the string are digits and there
12034
is at least one character in the string.
12035
[clinic start generated code]*/
12036
12037
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
{
    /* True when the string is non-empty and every character satisfies
       Py_UNICODE_ISDIGIT(). */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *chars = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, chars, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, chars, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12065
12066
/*[clinic input]
12067
@permit_long_docstring_body
12068
str.isnumeric as unicode_isnumeric
12069
12070
Return True if the string is a numeric string, False otherwise.
12071
12072
A string is numeric if all characters in the string are numeric and there is at
12073
least one character in the string.
12074
[clinic start generated code]*/
12075
12076
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
{
    /* str.isnumeric(): true only when the string is non-empty and every
       character passes Py_UNICODE_ISNUMERIC. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    switch (nchars) {
    case 0:
        /* An empty string has no numeric characters at all. */
        Py_RETURN_FALSE;
    case 1: {
        /* Single-character shortcut: answer with one lookup. */
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISNUMERIC(only));
    }
    default:
        break;
    }

    /* Longer strings: reject on the first non-numeric character. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12103
12104
Py_ssize_t
12105
_PyUnicode_ScanIdentifier(PyObject *self)
12106
62.5k
{
12107
62.5k
    Py_ssize_t i;
12108
62.5k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12109
62.5k
    if (len == 0) {
12110
        /* an empty string is not a valid identifier */
12111
0
        return 0;
12112
0
    }
12113
12114
62.5k
    int kind = PyUnicode_KIND(self);
12115
62.5k
    const void *data = PyUnicode_DATA(self);
12116
62.5k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12117
    /* PEP 3131 says that the first character must be in
12118
       XID_Start and subsequent characters in XID_Continue,
12119
       and for the ASCII range, the 2.x rules apply (i.e
12120
       start with letters and underscore, continue with
12121
       letters, digits, underscore). However, given the current
12122
       definition of XID_Start and XID_Continue, it is sufficient
12123
       to check just for these, except that _ must be allowed
12124
       as starting an identifier.  */
12125
62.5k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12126
793
        return 0;
12127
793
    }
12128
12129
522k
    for (i = 1; i < len; i++) {
12130
461k
        ch = PyUnicode_READ(kind, data, i);
12131
461k
        if (!_PyUnicode_IsXidContinue(ch)) {
12132
359
            return i;
12133
359
        }
12134
461k
    }
12135
61.3k
    return i;
12136
61.7k
}
12137
12138
int
12139
PyUnicode_IsIdentifier(PyObject *self)
12140
51.4k
{
12141
51.4k
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12142
51.4k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12143
    /* an empty string is not a valid identifier */
12144
51.4k
    return len && i == len;
12145
51.4k
}
12146
12147
/*[clinic input]
12148
@permit_long_docstring_body
12149
str.isidentifier as unicode_isidentifier
12150
12151
Return True if the string is a valid Python identifier, False otherwise.
12152
12153
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12154
such as "def" or "class".
12155
[clinic start generated code]*/
12156
12157
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
{
    /* str.isidentifier(): wrap the C-level predicate in a Python bool. */
    const int is_identifier = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_identifier);
}
12163
12164
/*[clinic input]
12165
@permit_long_summary
12166
str.isprintable as unicode_isprintable
12167
12168
Return True if all characters in the string are printable, False otherwise.
12169
12170
A character is printable if repr() may use it in its output.
12171
[clinic start generated code]*/
12172
12173
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
{
    /* str.isprintable(): true when no character fails
       Py_UNICODE_ISPRINTABLE.  There is no special case for the empty
       string, so it falls through the loop and yields True. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Fast path: a one-character string needs exactly one lookup. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISPRINTABLE(only));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISPRINTABLE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12197
12198
/*[clinic input]
12199
@permit_long_docstring_body
12200
str.join as unicode_join
12201
12202
    iterable: object
12203
    /
12204
12205
Concatenate any number of strings.
12206
12207
The string whose method is called is inserted in between each given string.
12208
The result is returned as a new string.
12209
12210
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12211
[clinic start generated code]*/
12212
12213
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
{
    /* str.join(): `self` acts as the separator; the work is delegated to
       PyUnicode_Join, whose result (or NULL) is passed straight through. */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12219
12220
static Py_ssize_t
12221
unicode_length(PyObject *self)
12222
32.0M
{
12223
32.0M
    return PyUnicode_GET_LENGTH(self);
12224
32.0M
}
12225
12226
/*[clinic input]
12227
str.ljust as unicode_ljust
12228
12229
    width: Py_ssize_t
12230
    fillchar: Py_UCS4 = ' '
12231
    /
12232
12233
Return a left-justified string of length width.
12234
12235
Padding is done using the specified fill character (default is a space).
12236
[clinic start generated code]*/
12237
12238
static PyObject *
12239
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12240
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12241
130
{
12242
130
    if (PyUnicode_GET_LENGTH(self) >= width)
12243
62
        return unicode_result_unchanged(self);
12244
12245
68
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12246
130
}
12247
12248
/*[clinic input]
12249
str.lower as unicode_lower
12250
12251
Return a copy of the string converted to lowercase.
12252
[clinic start generated code]*/
12253
12254
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* str.lower(): non-ASCII strings take the general case-mapping route
       with do_lower; ASCII strings use the dedicated ASCII helper. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }
    /* The second argument (1) presumably selects lowercasing in the shared
       upper/lower helper -- confirm against ascii_upper_or_lower. */
    return ascii_upper_or_lower(self, 1);
}
12262
12263
65.1M
#define LEFTSTRIP 0   /* strip at the start of the string only */
#define RIGHTSTRIP 1  /* strip at the end of the string only */
#define BOTHSTRIP 2   /* strip at both ends */

/* Method names used in error messages, indexed by the strip-type constants
   defined above. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

/* Map a strip-type constant to the name of the corresponding str method. */
#define STRIPNAME(i) (stripfuncnames[i])
12271
12272
/* externally visible for str.strip(unicode) */
12273
PyObject *
12274
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12275
4.45M
{
12276
4.45M
    const void *data;
12277
4.45M
    int kind;
12278
4.45M
    Py_ssize_t i, j, len;
12279
4.45M
    BLOOM_MASK sepmask;
12280
4.45M
    Py_ssize_t seplen;
12281
12282
4.45M
    kind = PyUnicode_KIND(self);
12283
4.45M
    data = PyUnicode_DATA(self);
12284
4.45M
    len = PyUnicode_GET_LENGTH(self);
12285
4.45M
    seplen = PyUnicode_GET_LENGTH(sepobj);
12286
4.45M
    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12287
4.45M
                              PyUnicode_DATA(sepobj),
12288
4.45M
                              seplen);
12289
12290
4.45M
    i = 0;
12291
4.45M
    if (striptype != RIGHTSTRIP) {
12292
454k
        while (i < len) {
12293
451k
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12294
451k
            if (!BLOOM(sepmask, ch))
12295
416k
                break;
12296
35.2k
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12297
2.58k
                break;
12298
32.7k
            i++;
12299
32.7k
        }
12300
421k
    }
12301
12302
4.45M
    j = len;
12303
4.45M
    if (striptype != LEFTSTRIP) {
12304
4.03M
        j--;
12305
4.69M
        while (j >= i) {
12306
3.48M
            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12307
3.48M
            if (!BLOOM(sepmask, ch))
12308
2.69M
                break;
12309
785k
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12310
119k
                break;
12311
666k
            j--;
12312
666k
        }
12313
12314
4.03M
        j++;
12315
4.03M
    }
12316
12317
4.45M
    return PyUnicode_Substring(self, i, j);
12318
4.45M
}
12319
12320
/* Slice an exact str `container` using the start/stop index objects.
   Returns NULL when _PyEval_UnpackIndices reports failure; otherwise the
   substring selected by the unpacked indices. */
PyObject*
_PyUnicode_BinarySlice(PyObject *container, PyObject *start_o, PyObject *stop_o)
{
    assert(PyUnicode_CheckExact(container));

    Py_ssize_t istart, istop;
    const Py_ssize_t len = PyUnicode_GET_LENGTH(container);
    const int unpacked = _PyEval_UnpackIndices(start_o, stop_o, len,
                                               &istart, &istop);
    if (unpacked) {
        return PyUnicode_Substring(container, istart, istop);
    }
    return NULL;
}
12331
12332
PyObject*
12333
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12334
255M
{
12335
255M
    const unsigned char *data;
12336
255M
    int kind;
12337
255M
    Py_ssize_t length;
12338
12339
255M
    length = PyUnicode_GET_LENGTH(self);
12340
255M
    end = Py_MIN(end, length);
12341
12342
255M
    if (start == 0 && end == length)
12343
67.0M
        return unicode_result_unchanged(self);
12344
12345
188M
    if (start < 0 || end < 0) {
12346
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12347
0
        return NULL;
12348
0
    }
12349
188M
    if (start >= length || end < start)
12350
5.35M
        _Py_RETURN_UNICODE_EMPTY();
12351
12352
182M
    length = end - start;
12353
182M
    if (PyUnicode_IS_ASCII(self)) {
12354
60.4M
        data = PyUnicode_1BYTE_DATA(self);
12355
60.4M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12356
60.4M
    }
12357
122M
    else {
12358
122M
        kind = PyUnicode_KIND(self);
12359
122M
        data = PyUnicode_1BYTE_DATA(self);
12360
122M
        return PyUnicode_FromKindAndData(kind,
12361
122M
                                         data + kind * start,
12362
122M
                                         length);
12363
122M
    }
12364
182M
}
12365
12366
/* Strip whitespace from the start and/or end of `self`, depending on
   `striptype` (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP).  ASCII strings are
   tested against the _Py_ascii_whitespace table; all other strings use
   Py_UNICODE_ISSPACE. */
static PyObject *
do_strip(PyObject *self, int striptype)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int trim_front = (striptype != RIGHTSTRIP);
    const int trim_back = (striptype != LEFTSTRIP);

    /* [lo, hi) is the range of characters that will be kept. */
    Py_ssize_t lo = 0;
    Py_ssize_t hi = len;

    if (PyUnicode_IS_ASCII(self)) {
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);

        if (trim_front) {
            while (lo < len && _Py_ascii_whitespace[data[lo]]) {
                lo++;
            }
        }
        if (trim_back) {
            while (hi > lo && _Py_ascii_whitespace[data[hi - 1]]) {
                hi--;
            }
        }
    }
    else {
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (trim_front) {
            for (; lo < len; lo++) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, lo);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
        if (trim_back) {
            for (; hi > lo; hi--) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, hi - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
    }

    return PyUnicode_Substring(self, lo, hi);
}
12427
12428
12429
/* Shared implementation of strip/lstrip/rstrip with an optional character
   set.  `sep` of None strips whitespace, a str strips the characters it
   contains, and anything else raises TypeError naming the method. */
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }
    if (PyUnicode_Check(sep)) {
        return _PyUnicode_XStrip(self, striptype, sep);
    }

    PyErr_Format(PyExc_TypeError,
                 "%s arg must be None or str",
                 STRIPNAME(striptype));
    return NULL;
}
12445
12446
12447
/*[clinic input]
12448
@permit_long_summary
12449
str.strip as unicode_strip
12450
12451
    chars: object = None
12452
    /
12453
12454
Return a copy of the string with leading and trailing whitespace removed.
12455
12456
If chars is given and not None, remove characters in chars instead.
12457
[clinic start generated code]*/
12458
12459
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
{
    /* str.strip(): strip at both ends via the shared helper. */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12465
12466
12467
/*[clinic input]
12468
str.lstrip as unicode_lstrip
12469
12470
    chars: object = None
12471
    /
12472
12473
Return a copy of the string with leading whitespace removed.
12474
12475
If chars is given and not None, remove characters in chars instead.
12476
[clinic start generated code]*/
12477
12478
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* str.lstrip(): strip at the start only via the shared helper. */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12484
12485
12486
/*[clinic input]
12487
str.rstrip as unicode_rstrip
12488
12489
    chars: object = None
12490
    /
12491
12492
Return a copy of the string with trailing whitespace removed.
12493
12494
If chars is given and not None, remove characters in chars instead.
12495
[clinic start generated code]*/
12496
12497
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* str.rstrip(): strip at the end only via the shared helper. */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12503
12504
12505
PyObject *
12506
_PyUnicode_Repeat(PyObject *str, Py_ssize_t len)
12507
344k
{
12508
344k
    PyObject *u;
12509
344k
    Py_ssize_t nchars, n;
12510
12511
344k
    if (len < 1)
12512
32.4k
        _Py_RETURN_UNICODE_EMPTY();
12513
12514
    /* no repeat, return original string */
12515
312k
    if (len == 1)
12516
29.7k
        return unicode_result_unchanged(str);
12517
12518
282k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12519
0
        PyErr_SetString(PyExc_OverflowError,
12520
0
                        "repeated string is too long");
12521
0
        return NULL;
12522
0
    }
12523
282k
    nchars = len * PyUnicode_GET_LENGTH(str);
12524
12525
282k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12526
282k
    if (!u)
12527
0
        return NULL;
12528
282k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12529
12530
282k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12531
280k
        int kind = PyUnicode_KIND(str);
12532
280k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12533
280k
        if (kind == PyUnicode_1BYTE_KIND) {
12534
280k
            void *to = PyUnicode_DATA(u);
12535
280k
            memset(to, (unsigned char)fill_char, len);
12536
280k
        }
12537
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12538
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12539
0
            for (n = 0; n < len; ++n)
12540
0
                ucs2[n] = fill_char;
12541
0
        } else {
12542
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12543
0
            assert(kind == PyUnicode_4BYTE_KIND);
12544
0
            for (n = 0; n < len; ++n)
12545
0
                ucs4[n] = fill_char;
12546
0
        }
12547
280k
    }
12548
2.28k
    else {
12549
2.28k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12550
2.28k
        char *to = (char *) PyUnicode_DATA(u);
12551
2.28k
        _PyBytes_RepeatBuffer(to, nchars * char_size, PyUnicode_DATA(str),
12552
2.28k
            PyUnicode_GET_LENGTH(str) * char_size);
12553
2.28k
    }
12554
12555
282k
    assert(_PyUnicode_CheckConsistency(u, 1));
12556
282k
    return u;
12557
282k
}
12558
12559
PyObject *
12560
PyUnicode_Replace(PyObject *str,
12561
                  PyObject *substr,
12562
                  PyObject *replstr,
12563
                  Py_ssize_t maxcount)
12564
0
{
12565
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12566
0
            ensure_unicode(replstr) < 0)
12567
0
        return NULL;
12568
0
    return replace(str, substr, replstr, maxcount);
12569
0
}
12570
12571
/*[clinic input]
12572
str.replace as unicode_replace
12573
12574
    old: unicode
12575
    new: unicode
12576
    /
12577
    count: Py_ssize_t = -1
12578
        Maximum number of occurrences to replace.
12579
        -1 (the default value) means replace all occurrences.
12580
12581
Return a copy with all occurrences of substring old replaced by new.
12582
12583
If count is given, only the first count occurrences are replaced.
12584
If count is not specified or -1, then all occurrences are replaced.
12585
[clinic start generated code]*/
12586
12587
static PyObject *
12588
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12589
                     Py_ssize_t count)
12590
/*[clinic end generated code: output=b63f1a8b5eebf448 input=d15a6886b05e2edc]*/
12591
20.2M
{
12592
20.2M
    return replace(self, old, new, count);
12593
20.2M
}
12594
12595
/*[clinic input]
12596
@permit_long_docstring_body
12597
str.removeprefix as unicode_removeprefix
12598
12599
    prefix: unicode
12600
    /
12601
12602
Return a str with the given prefix string removed if present.
12603
12604
If the string starts with the prefix string, return string[len(prefix):].
12605
Otherwise, return a copy of the original string.
12606
[clinic start generated code]*/
12607
12608
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
{
    /* str.removeprefix(): test the whole string against `prefix` with
       tailmatch() (direction -1), then cut the prefix off on a match. */
    const int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);

    /* -1 is tailmatch()'s failure result; propagate it as NULL. */
    if (match == -1) {
        return NULL;
    }
    if (!match) {
        return unicode_result_unchanged(self);
    }

    const Py_ssize_t skip = PyUnicode_GET_LENGTH(prefix);
    const Py_ssize_t total = PyUnicode_GET_LENGTH(self);
    return PyUnicode_Substring(self, skip, total);
}
12622
12623
/*[clinic input]
12624
str.removesuffix as unicode_removesuffix
12625
12626
    suffix: unicode
12627
    /
12628
12629
Return a str with the given suffix string removed if present.
12630
12631
If the string ends with the suffix string and that suffix is not empty,
12632
return string[:-len(suffix)]. Otherwise, return a copy of the original
12633
string.
12634
[clinic start generated code]*/
12635
12636
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* str.removesuffix(): test the whole string against `suffix` with
       tailmatch() (direction +1), then cut the suffix off on a match. */
    const int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);

    /* -1 is tailmatch()'s failure result; propagate it as NULL. */
    if (match == -1) {
        return NULL;
    }
    if (!match) {
        return unicode_result_unchanged(self);
    }

    const Py_ssize_t keep = PyUnicode_GET_LENGTH(self)
                            - PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, keep);
}
12650
12651
/* Build the repr() of a str object.

   Pass 1 walks the input to compute the output length, count both quote
   characters, and find the widest printable non-ASCII character (which
   fixes the kind of the result).  Pass 2 fills a freshly allocated
   string: by a plain copy between quotes when nothing needs escaping,
   otherwise through the kind-specific *_repr writer.

   Returns NULL with OverflowError set if the output length would exceed
   PY_SSIZE_T_MAX, or NULL if the result cannot be allocated. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    const Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);
    const int ikind = PyUnicode_KIND(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;

    for (Py_ssize_t pos = 0; pos < isize; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(ikind, idata, pos);
        Py_ssize_t width;  /* output characters produced for ch */

        if (ch == '\'') {
            squote++;
            width = 1;
        }
        else if (ch == '"') {
            dquote++;
            width = 1;
        }
        else if (ch == '\\' || ch == '\t' || ch == '\r' || ch == '\n') {
            width = 2;
        }
        else if (ch < ' ' || ch == 0x7f) {
            /* Fast-path ASCII */
            width = 4; /* \xHH */
        }
        else if (ch < 0x7f) {
            width = 1;
        }
        else if (Py_UNICODE_ISPRINTABLE(ch)) {
            /* Copied as-is, so it may widen the result's kind. */
            if (ch > maxch) {
                maxch = ch;
            }
            width = 1;
        }
        else if (ch < 0x100) {
            width = 4; /* \xHH */
        }
        else if (ch < 0x10000) {
            width = 6; /* \uHHHH */
        }
        else {
            width = 10; /* \uHHHHHHHH */
        }

        if (osize > PY_SSIZE_T_MAX - width) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += width;
    }

    /* Quote selection: default to single quotes; switch to double quotes
       when the text has single quotes but no double quotes. */
    Py_UCS4 quote = '\'';
    int changed = (osize != isize);
    if (squote) {
        changed = 1;
        if (dquote) {
            /* Both squote and dquote present. Use squote,
               and escape them */
            osize += squote;
        }
        else {
            quote = '"';
        }
    }
    osize += 2;   /* quotes */

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL) {
        return NULL;
    }
    const int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (changed) {
        /* Escaping needed: use the writer matching the output kind. */
        if (okind == PyUnicode_1BYTE_KIND) {
            ucs1lib_repr(unicode, quote, odata);
        }
        else if (okind == PyUnicode_2BYTE_KIND) {
            ucs2lib_repr(unicode, quote, odata);
        }
        else {
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }
    else {
        /* Nothing to escape: quote, verbatim copy, quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
12741
12742
/*[clinic input]
12743
@permit_long_summary
12744
str.rfind as unicode_rfind = str.count
12745
12746
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12747
12748
Optional arguments start and end are interpreted as in slice notation.
12749
Return -1 on failure.
12750
[clinic start generated code]*/
12751
12752
static Py_ssize_t
12753
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12754
                   Py_ssize_t end)
12755
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
12756
267k
{
12757
267k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12758
267k
    if (result < 0) {
12759
10.8k
        return -1;
12760
10.8k
    }
12761
256k
    return result;
12762
267k
}
12763
12764
/*[clinic input]
12765
@permit_long_summary
12766
str.rindex as unicode_rindex = str.count
12767
12768
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12769
12770
Optional arguments start and end are interpreted as in slice notation.
12771
Raises ValueError when the substring is not found.
12772
[clinic start generated code]*/
12773
12774
static Py_ssize_t
12775
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12776
                    Py_ssize_t end)
12777
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
12778
165k
{
12779
165k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12780
165k
    if (result == -1) {
12781
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12782
0
    }
12783
165k
    else if (result < 0) {
12784
0
        return -1;
12785
0
    }
12786
165k
    return result;
12787
165k
}
12788
12789
/*[clinic input]
12790
str.rjust as unicode_rjust
12791
12792
    width: Py_ssize_t
12793
    fillchar: Py_UCS4 = ' '
12794
    /
12795
12796
Return a right-justified string of length width.
12797
12798
Padding is done using the specified fill character (default is a space).
12799
[clinic start generated code]*/
12800
12801
static PyObject *
12802
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12803
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12804
0
{
12805
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12806
0
        return unicode_result_unchanged(self);
12807
12808
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12809
0
}
12810
12811
PyObject *
12812
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12813
0
{
12814
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12815
0
        return NULL;
12816
12817
0
    return split(s, sep, maxsplit);
12818
0
}
12819
12820
/*[clinic input]
12821
@permit_long_summary
12822
str.split as unicode_split
12823
12824
    sep: object = None
12825
        The separator used to split the string.
12826
12827
        When set to None (the default value), will split on any whitespace
12828
        character (including \n \r \t \f and spaces) and will discard
12829
        empty strings from the result.
12830
    maxsplit: Py_ssize_t = -1
12831
        Maximum number of splits.
12832
        -1 (the default value) means no limit.
12833
12834
Return a list of the substrings in the string, using sep as the separator string.
12835
12836
Splitting starts at the front of the string and works to the end.
12837
12838
Note, str.split() is mainly useful for data that has been intentionally
12839
delimited.  With natural text that includes punctuation, consider using
12840
the regular expression module.
12841
12842
[clinic start generated code]*/
12843
12844
static PyObject *
12845
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12846
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
12847
20.8M
{
12848
20.8M
    if (sep == Py_None)
12849
164k
        return split(self, NULL, maxsplit);
12850
20.6M
    if (PyUnicode_Check(sep))
12851
20.6M
        return split(self, sep, maxsplit);
12852
12853
0
    PyErr_Format(PyExc_TypeError,
12854
0
                 "must be str or None, not %.100s",
12855
0
                 Py_TYPE(sep)->tp_name);
12856
0
    return NULL;
12857
20.6M
}
12858
12859
/* Partition `str_obj` around `sep_obj`, searching from the front.

   Both arguments must pass ensure_unicode().  If the separator has a wider
   kind than the string, or is longer than it, the function returns
   (str_obj, empty, empty) without searching.  Otherwise the separator
   buffer is converted to the string's kind when the kinds differ (the
   temporary is freed before returning) and the kind-specific partition
   routine builds the result.  Returns NULL on failure. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int kind1 = PyUnicode_KIND(str_obj);
    const int kind2 = PyUnicode_KIND(sep_obj);
    const Py_ssize_t len1 = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t len2 = PyUnicode_GET_LENGTH(sep_obj);

    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *buf1 = PyUnicode_DATA(str_obj);
    const void *buf2 = PyUnicode_DATA(sep_obj);
    /* When set, buf2 is a temporary owned by this function. */
    const int converted = (kind2 != kind1);
    if (converted) {
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (buf2 == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    if (kind1 == PyUnicode_1BYTE_KIND) {
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        else {
            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
    }
    else if (kind1 == PyUnicode_2BYTE_KIND) {
        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
    }
    else if (kind1 == PyUnicode_4BYTE_KIND) {
        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
    }
    else {
        Py_UNREACHABLE();
    }

    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)buf2);
    }

    return out;
}
12909
12910
12911
/* Partition `str_obj` around `sep_obj` using the rpartition routines.

   Both arguments must pass ensure_unicode().  If the separator has a wider
   kind than the string, or is longer than it, the function returns
   (empty, empty, str_obj) without searching.  Otherwise the separator
   buffer is converted to the string's kind when the kinds differ (the
   temporary is freed before returning) and the kind-specific rpartition
   routine builds the result.  Returns NULL on failure. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int kind1 = PyUnicode_KIND(str_obj);
    const int kind2 = PyUnicode_KIND(sep_obj);
    const Py_ssize_t len1 = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t len2 = PyUnicode_GET_LENGTH(sep_obj);

    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *buf1 = PyUnicode_DATA(str_obj);
    const void *buf2 = PyUnicode_DATA(sep_obj);
    /* When set, buf2 is a temporary owned by this function. */
    const int converted = (kind2 != kind1);
    if (converted) {
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (buf2 == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    if (kind1 == PyUnicode_1BYTE_KIND) {
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        else {
            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
    }
    else if (kind1 == PyUnicode_2BYTE_KIND) {
        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
    }
    else if (kind1 == PyUnicode_4BYTE_KIND) {
        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
    }
    else {
        Py_UNREACHABLE();
    }

    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)buf2);
    }

    return out;
}
12961
12962
/*[clinic input]
12963
@permit_long_docstring_body
12964
str.partition as unicode_partition
12965
12966
    sep: object
12967
    /
12968
12969
Partition the string into three parts using the given separator.
12970
12971
This will search for the separator in the string.  If the separator is found,
12972
returns a 3-tuple containing the part before the separator, the separator
12973
itself, and the part after it.
12974
12975
If the separator is not found, returns a 3-tuple containing the original string
12976
and two empty strings.
12977
[clinic start generated code]*/
12978
12979
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* Method entry point for str.partition(): the whole job is handed to
       PyUnicode_Partition() and its result is returned unchanged. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
12985
12986
/*[clinic input]
12987
@permit_long_docstring_body
12988
str.rpartition as unicode_rpartition = str.partition
12989
12990
Partition the string into three parts using the given separator.
12991
12992
This will search for the separator in the string, starting at the end. If
12993
the separator is found, returns a 3-tuple containing the part before the
12994
separator, the separator itself, and the part after it.
12995
12996
If the separator is not found, returns a 3-tuple containing two empty strings
12997
and the original string.
12998
[clinic start generated code]*/
12999
13000
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* Method entry point for str.rpartition(): the whole job is handed to
       PyUnicode_RPartition() and its result is returned unchanged. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13006
13007
PyObject *
13008
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13009
0
{
13010
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13011
0
        return NULL;
13012
13013
0
    return rsplit(s, sep, maxsplit);
13014
0
}
13015
13016
/*[clinic input]
13017
@permit_long_summary
13018
str.rsplit as unicode_rsplit = str.split
13019
13020
Return a list of the substrings in the string, using sep as the separator string.
13021
13022
Splitting starts at the end of the string and works to the front.
13023
[clinic start generated code]*/
13024
13025
static PyObject *
13026
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13027
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13028
66
{
13029
66
    if (sep == Py_None)
13030
0
        return rsplit(self, NULL, maxsplit);
13031
66
    if (PyUnicode_Check(sep))
13032
66
        return rsplit(self, sep, maxsplit);
13033
13034
0
    PyErr_Format(PyExc_TypeError,
13035
0
                 "must be str or None, not %.100s",
13036
0
                 Py_TYPE(sep)->tp_name);
13037
0
    return NULL;
13038
66
}
13039
13040
/*[clinic input]
13041
@permit_long_docstring_body
13042
str.splitlines as unicode_splitlines
13043
13044
    keepends: bool = False
13045
13046
Return a list of the lines in the string, breaking at line boundaries.
13047
13048
Line breaks are not included in the resulting list unless keepends is given and
13049
true.
13050
[clinic start generated code]*/
13051
13052
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* Method entry point for str.splitlines(): forwards both arguments to
       PyUnicode_Splitlines() and returns its result unchanged. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13058
13059
static
13060
PyObject *unicode_str(PyObject *self)
13061
2.93M
{
13062
2.93M
    return unicode_result_unchanged(self);
13063
2.93M
}
13064
13065
/*[clinic input]
13066
@permit_long_summary
13067
str.swapcase as unicode_swapcase
13068
13069
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13070
[clinic start generated code]*/
13071
13072
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* Method entry point for str.swapcase(): runs case_operation() with
       the do_swapcase callback and returns its result unchanged. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13078
13079
static int
13080
unicode_maketrans_from_dict(PyObject *x, PyObject *newdict)
13081
0
{
13082
0
    PyObject *key, *value;
13083
0
    Py_ssize_t i = 0;
13084
0
    int res;
13085
0
    while (PyDict_Next(x, &i, &key, &value)) {
13086
0
        if (PyUnicode_Check(key)) {
13087
0
            PyObject *newkey;
13088
0
            int kind;
13089
0
            const void *data;
13090
0
            if (PyUnicode_GET_LENGTH(key) != 1) {
13091
0
                PyErr_SetString(PyExc_ValueError, "string keys in translate"
13092
0
                                "table must be of length 1");
13093
0
                return -1;
13094
0
            }
13095
0
            kind = PyUnicode_KIND(key);
13096
0
            data = PyUnicode_DATA(key);
13097
0
            newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13098
0
            if (!newkey)
13099
0
                return -1;
13100
0
            res = PyDict_SetItem(newdict, newkey, value);
13101
0
            Py_DECREF(newkey);
13102
0
            if (res < 0)
13103
0
                return -1;
13104
0
        }
13105
0
        else if (PyLong_Check(key)) {
13106
0
            if (PyDict_SetItem(newdict, key, value) < 0)
13107
0
                return -1;
13108
0
        }
13109
0
        else {
13110
0
            PyErr_SetString(PyExc_TypeError, "keys in translate table must"
13111
0
                            "be strings or integers");
13112
0
            return -1;
13113
0
        }
13114
0
    }
13115
0
    return 0;
13116
0
}
13117
13118
/*[clinic input]
13119
13120
@staticmethod
13121
str.maketrans as unicode_maketrans
13122
13123
  x: object
13124
13125
  y: unicode=NULL
13126
13127
  z: unicode=NULL
13128
13129
  /
13130
13131
Return a translation table usable for str.translate().
13132
13133
If there is only one argument, it must be a dictionary mapping Unicode
13134
ordinals (integers) or characters to Unicode ordinals, strings or None.
13135
Character keys will be then converted to ordinals.
13136
If there are two arguments, they must be strings of equal length, and
13137
in the resulting dictionary, each character in x will be mapped to the
13138
character at the same position in y. If there is a third argument, it
13139
must be a string, whose characters will be mapped to None in the result.
13140
[clinic start generated code]*/
13141
13142
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* The table being built; released on every error path. */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* One-argument form: x must be a dict. */
        if (!PyAnyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto err;
        }
        /* Copy the entries while holding the critical section on x;
           string keys become int keys in the copy. */
        int errcode;
        Py_BEGIN_CRITICAL_SECTION(x);
        errcode = unicode_maketrans_from_dict(x, table);
        Py_END_CRITICAL_SECTION();
        if (errcode < 0) {
            goto err;
        }
    }
    else {
        /* Two- or three-argument form: x must be a str as long as y. */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto err;
        }
        const Py_ssize_t x_len = PyUnicode_GET_LENGTH(x);
        if (x_len != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto err;
        }

        /* Map the ordinal of x[idx] to the ordinal of y[idx]. */
        const int x_kind = PyUnicode_KIND(x);
        const int y_kind = PyUnicode_KIND(y);
        const void *x_data = PyUnicode_DATA(x);
        const void *y_data = PyUnicode_DATA(y);
        for (Py_ssize_t idx = 0; idx < x_len; idx++) {
            PyObject *from = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, idx));
            if (from == NULL) {
                goto err;
            }
            PyObject *to = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, idx));
            if (to == NULL) {
                Py_DECREF(from);
                goto err;
            }
            int status = PyDict_SetItem(table, from, to);
            Py_DECREF(from);
            Py_DECREF(to);
            if (status < 0) {
                goto err;
            }
        }

        /* Map the ordinal of every character of z to None. */
        if (z != NULL) {
            const int z_kind = PyUnicode_KIND(z);
            const void *z_data = PyUnicode_DATA(z);
            const Py_ssize_t z_len = PyUnicode_GET_LENGTH(z);
            for (Py_ssize_t idx = 0; idx < z_len; idx++) {
                PyObject *dropped = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, idx));
                if (dropped == NULL) {
                    goto err;
                }
                int status = PyDict_SetItem(table, dropped, Py_None);
                Py_DECREF(dropped);
                if (status < 0) {
                    goto err;
                }
            }
        }
    }
    return table;

  err:
    Py_DECREF(table);
    return NULL;
}
13222
13223
/*[clinic input]
13224
@permit_long_docstring_body
13225
str.translate as unicode_translate
13226
13227
    table: object
13228
        Translation table, which must be a mapping of Unicode ordinals to
13229
        Unicode ordinals, strings, or None.
13230
    /
13231
13232
Replace each character in the string using the given translation table.
13233
13234
The table must implement lookup/indexing via __getitem__, for instance a
13235
dictionary or list.  If this operation raises LookupError, the character is
13236
left untouched.  Characters mapped to None are deleted.
13237
[clinic start generated code]*/
13238
13239
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* Method entry point for str.translate(): calls
       _PyUnicode_TranslateCharmap() with the fixed error handler name
       "ignore" and returns its result unchanged. */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13245
13246
/*[clinic input]
13247
str.upper as unicode_upper
13248
13249
Return a copy of the string converted to uppercase.
13250
[clinic start generated code]*/
13251
13252
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* Non-ASCII strings go through the generic case_operation() path. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    /* ASCII strings use the dedicated helper (second argument 0). */
    return ascii_upper_or_lower(self, 0);
}
13260
13261
/*[clinic input]
13262
@permit_long_summary
13263
str.zfill as unicode_zfill
13264
13265
    width: Py_ssize_t
13266
    /
13267
13268
Pad a numeric string with zeros on the left, to fill a field of the given width.
13269
13270
The string is never truncated.
13271
[clinic start generated code]*/
13272
13273
static PyObject *
13274
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13275
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13276
0
{
13277
0
    Py_ssize_t fill;
13278
0
    PyObject *u;
13279
0
    int kind;
13280
0
    const void *data;
13281
0
    Py_UCS4 chr;
13282
13283
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13284
0
        return unicode_result_unchanged(self);
13285
13286
0
    fill = width - PyUnicode_GET_LENGTH(self);
13287
13288
0
    u = pad(self, fill, 0, '0');
13289
13290
0
    if (u == NULL)
13291
0
        return NULL;
13292
13293
0
    kind = PyUnicode_KIND(u);
13294
0
    data = PyUnicode_DATA(u);
13295
0
    chr = PyUnicode_READ(kind, data, fill);
13296
13297
0
    if (chr == '+' || chr == '-') {
13298
        /* move sign to beginning of string */
13299
0
        PyUnicode_WRITE(kind, data, 0, chr);
13300
0
        PyUnicode_WRITE(kind, data, fill, '0');
13301
0
    }
13302
13303
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13304
0
    return u;
13305
0
}
13306
13307
/*[clinic input]
13308
@permit_long_summary
13309
@text_signature "($self, prefix[, start[, end]], /)"
13310
str.startswith as unicode_startswith
13311
13312
    prefix as subobj: object
13313
        A string or a tuple of strings to try.
13314
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13315
        Optional start position. Default: start of the string.
13316
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13317
        Optional stop position. Default: end of the string.
13318
    /
13319
13320
Return True if the string starts with the specified prefix, False otherwise.
13321
[clinic start generated code]*/
13322
13323
static PyObject *
13324
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13325
                        Py_ssize_t end)
13326
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13327
43.0M
{
13328
43.0M
    if (PyTuple_Check(subobj)) {
13329
1.86M
        Py_ssize_t i;
13330
6.80M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13331
4.95M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13332
4.95M
            if (!PyUnicode_Check(substring)) {
13333
0
                PyErr_Format(PyExc_TypeError,
13334
0
                             "tuple for startswith must only contain str, "
13335
0
                             "not %.100s",
13336
0
                             Py_TYPE(substring)->tp_name);
13337
0
                return NULL;
13338
0
            }
13339
4.95M
            int result = tailmatch(self, substring, start, end, -1);
13340
4.95M
            if (result < 0) {
13341
0
                return NULL;
13342
0
            }
13343
4.95M
            if (result) {
13344
18.6k
                Py_RETURN_TRUE;
13345
18.6k
            }
13346
4.95M
        }
13347
        /* nothing matched */
13348
1.86M
        Py_RETURN_FALSE;
13349
1.86M
    }
13350
41.1M
    if (!PyUnicode_Check(subobj)) {
13351
0
        PyErr_Format(PyExc_TypeError,
13352
0
                     "startswith first arg must be str or "
13353
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13354
0
        return NULL;
13355
0
    }
13356
41.1M
    int result = tailmatch(self, subobj, start, end, -1);
13357
41.1M
    if (result < 0) {
13358
0
        return NULL;
13359
0
    }
13360
41.1M
    return PyBool_FromLong(result);
13361
41.1M
}
13362
13363
13364
/*[clinic input]
13365
@permit_long_summary
13366
@text_signature "($self, suffix[, start[, end]], /)"
13367
str.endswith as unicode_endswith
13368
13369
    suffix as subobj: object
13370
        A string or a tuple of strings to try.
13371
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13372
        Optional start position. Default: start of the string.
13373
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13374
        Optional stop position. Default: end of the string.
13375
    /
13376
13377
Return True if the string ends with the specified suffix, False otherwise.
13378
[clinic start generated code]*/
13379
13380
static PyObject *
13381
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13382
                      Py_ssize_t end)
13383
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13384
10.0M
{
13385
10.0M
    if (PyTuple_Check(subobj)) {
13386
168k
        Py_ssize_t i;
13387
311k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13388
287k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13389
287k
            if (!PyUnicode_Check(substring)) {
13390
0
                PyErr_Format(PyExc_TypeError,
13391
0
                             "tuple for endswith must only contain str, "
13392
0
                             "not %.100s",
13393
0
                             Py_TYPE(substring)->tp_name);
13394
0
                return NULL;
13395
0
            }
13396
287k
            int result = tailmatch(self, substring, start, end, +1);
13397
287k
            if (result < 0) {
13398
0
                return NULL;
13399
0
            }
13400
287k
            if (result) {
13401
144k
                Py_RETURN_TRUE;
13402
144k
            }
13403
287k
        }
13404
168k
        Py_RETURN_FALSE;
13405
168k
    }
13406
9.91M
    if (!PyUnicode_Check(subobj)) {
13407
0
        PyErr_Format(PyExc_TypeError,
13408
0
                     "endswith first arg must be str or "
13409
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13410
0
        return NULL;
13411
0
    }
13412
9.91M
    int result = tailmatch(self, subobj, start, end, +1);
13413
9.91M
    if (result < 0) {
13414
0
        return NULL;
13415
0
    }
13416
9.91M
    return PyBool_FromLong(result);
13417
9.91M
}
13418
13419
13420
#include "stringlib/unicode_format.h"
13421
13422
PyDoc_STRVAR(format__doc__,
13423
             "format($self, /, *args, **kwargs)\n\
13424
--\n\
13425
\n\
13426
Return a formatted version of the string, using substitutions from args and kwargs.\n\
13427
The substitutions are identified by braces ('{' and '}').");
13428
13429
PyDoc_STRVAR(format_map__doc__,
13430
             "format_map($self, mapping, /)\n\
13431
--\n\
13432
\n\
13433
Return a formatted version of the string, using substitutions from mapping.\n\
13434
The substitutions are identified by braces ('{' and '}').");
13435
13436
/*[clinic input]
13437
str.__format__ as unicode___format__
13438
13439
    format_spec: unicode
13440
    /
13441
13442
Return a formatted version of the string as described by format_spec.
13443
[clinic start generated code]*/
13444
13445
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    /* Format `self` with the whole spec, i.e. the range [0, spec_len). */
    const Py_ssize_t spec_len = PyUnicode_GET_LENGTH(format_spec);
    int status = _PyUnicode_FormatAdvancedWriter(&writer, self, format_spec,
                                                 0, spec_len);
    if (status == -1) {
        /* the writer is released here before reporting the failure */
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }
    return _PyUnicodeWriter_Finish(&writer);
}
13462
13463
/*[clinic input]
13464
str.__sizeof__ as unicode_sizeof
13465
13466
Return the size of the string in memory, in bytes.
13467
[clinic start generated code]*/
13468
13469
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    /* Character count plus one extra slot (presumably the trailing NUL). */
    const Py_ssize_t slots = PyUnicode_GET_LENGTH(self) + 1;
    Py_ssize_t total;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* compact ASCII: base structure + `slots` bytes of characters */
        total = sizeof(PyASCIIObject) + slots;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* compact non-ASCII: base structure + `slots` characters of
           PyUnicode_KIND(self) bytes each */
        total = sizeof(PyCompactUnicodeObject) +
            slots * PyUnicode_KIND(self);
    }
    else {
        /* two-block object: base object, plus the character block when
           one is present */
        total = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            total += slots * PyUnicode_KIND(self);
        }
    }

    /* Add the UTF-8 buffer when _PyUnicode_HAS_UTF8_MEMORY() reports one. */
    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        total += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(total);
}
13497
13498
/* __getnewargs__: return a 1-tuple holding the result of
   _PyUnicode_Copy(v), or NULL if that copy fails. */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *copy = _PyUnicode_Copy(v);
    /* "N" hands the reference to `copy` over to the new tuple. */
    return (copy != NULL) ? Py_BuildValue("(N)", copy) : NULL;
}
13506
13507
/*
13508
This function searchs the longest common leading whitespace
13509
of all lines in the [src, end).
13510
It returns the length of the common leading whitespace and sets `output` to
13511
point to the beginning of the common leading whitespace if length > 0.
13512
*/
13513
static Py_ssize_t
13514
search_longest_common_leading_whitespace(
13515
    const char *const src,
13516
    const char *const end,
13517
    const char **output)
13518
0
{
13519
    // [_start, _start + _len)
13520
    // describes the current longest common leading whitespace
13521
0
    const char *_start = NULL;
13522
0
    Py_ssize_t _len = 0;
13523
13524
0
    for (const char *iter = src; iter < end; ++iter) {
13525
0
        const char *line_start = iter;
13526
0
        const char *leading_whitespace_end = NULL;
13527
13528
        // scan the whole line
13529
0
        while (iter < end && *iter != '\n') {
13530
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
13531
                /* `iter` points to the first non-whitespace character
13532
                   in this line */
13533
0
                if (iter == line_start) {
13534
                    // some line has no indent, fast exit!
13535
0
                    return 0;
13536
0
                }
13537
0
                leading_whitespace_end = iter;
13538
0
            }
13539
0
            ++iter;
13540
0
        }
13541
13542
        // if this line has all white space, skip it
13543
0
        if (!leading_whitespace_end) {
13544
0
            continue;
13545
0
        }
13546
13547
0
        if (!_start) {
13548
            // update the first leading whitespace
13549
0
            _start = line_start;
13550
0
            _len = leading_whitespace_end - line_start;
13551
0
            assert(_len > 0);
13552
0
        }
13553
0
        else {
13554
            /* We then compare with the current longest leading whitespace.
13555
13556
               [line_start, leading_whitespace_end) is the leading
13557
               whitespace of this line,
13558
13559
               [_start, _start + _len) is the leading whitespace of the
13560
               current longest leading whitespace. */
13561
0
            Py_ssize_t new_len = 0;
13562
0
            const char *_iter = _start, *line_iter = line_start;
13563
13564
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
13565
0
                   && *_iter == *line_iter)
13566
0
            {
13567
0
                ++_iter;
13568
0
                ++line_iter;
13569
0
                ++new_len;
13570
0
            }
13571
13572
0
            _len = new_len;
13573
0
            if (_len == 0) {
13574
                // No common things now, fast exit!
13575
0
                return 0;
13576
0
            }
13577
0
        }
13578
0
    }
13579
13580
0
    assert(_len >= 0);
13581
0
    if (_len > 0) {
13582
0
        *output = _start;
13583
0
    }
13584
0
    return _len;
13585
0
}
13586
13587
/* Dedent a string.
13588
   Intended to dedent Python source. Unlike `textwrap.dedent`, this
13589
   only supports spaces and tabs and doesn't normalize empty lines.
13590
   Return a new reference on success, NULL with exception set on error.
13591
   */
13592
PyObject *
13593
_PyUnicode_Dedent(PyObject *unicode)
13594
0
{
13595
0
    Py_ssize_t src_len = 0;
13596
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
13597
0
    if (!src) {
13598
0
        return NULL;
13599
0
    }
13600
0
    assert(src_len >= 0);
13601
0
    if (src_len == 0) {
13602
0
        return Py_NewRef(unicode);
13603
0
    }
13604
13605
0
    const char *const end = src + src_len;
13606
13607
    // [whitespace_start, whitespace_start + whitespace_len)
13608
    // describes the current longest common leading whitespace
13609
0
    const char *whitespace_start = NULL;
13610
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
13611
0
        src, end, &whitespace_start);
13612
13613
0
    if (whitespace_len == 0) {
13614
0
        return Py_NewRef(unicode);
13615
0
    }
13616
13617
    // now we should trigger a dedent
13618
0
    char *dest = PyMem_Malloc(src_len);
13619
0
    if (!dest) {
13620
0
        PyErr_NoMemory();
13621
0
        return NULL;
13622
0
    }
13623
0
    char *dest_iter = dest;
13624
13625
0
    for (const char *iter = src; iter < end; ++iter) {
13626
0
        const char *line_start = iter;
13627
0
        bool in_leading_space = true;
13628
13629
        // iterate over a line to find the end of a line
13630
0
        while (iter < end && *iter != '\n') {
13631
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
13632
0
                in_leading_space = false;
13633
0
            }
13634
0
            ++iter;
13635
0
        }
13636
13637
        // invariant: *iter == '\n' or iter == end
13638
0
        bool append_newline = iter < end;
13639
13640
        // if this line has all white space, write '\n' and continue
13641
0
        if (in_leading_space && append_newline) {
13642
0
            *dest_iter++ = '\n';
13643
0
            continue;
13644
0
        }
13645
13646
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
13647
            conditionally append '\n' */
13648
13649
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
13650
0
        assert(new_line_len >= 0);
13651
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
13652
13653
0
        dest_iter += new_line_len;
13654
13655
0
        if (append_newline) {
13656
0
            *dest_iter++ = '\n';
13657
0
        }
13658
0
    }
13659
13660
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
13661
0
    PyMem_Free(dest);
13662
0
    return res;
13663
0
}
13664
13665
static PyMethodDef unicode_methods[] = {
13666
    UNICODE_ENCODE_METHODDEF
13667
    UNICODE_REPLACE_METHODDEF
13668
    UNICODE_SPLIT_METHODDEF
13669
    UNICODE_RSPLIT_METHODDEF
13670
    UNICODE_JOIN_METHODDEF
13671
    UNICODE_CAPITALIZE_METHODDEF
13672
    UNICODE_CASEFOLD_METHODDEF
13673
    UNICODE_TITLE_METHODDEF
13674
    UNICODE_CENTER_METHODDEF
13675
    UNICODE_COUNT_METHODDEF
13676
    UNICODE_EXPANDTABS_METHODDEF
13677
    UNICODE_FIND_METHODDEF
13678
    UNICODE_PARTITION_METHODDEF
13679
    UNICODE_INDEX_METHODDEF
13680
    UNICODE_LJUST_METHODDEF
13681
    UNICODE_LOWER_METHODDEF
13682
    UNICODE_LSTRIP_METHODDEF
13683
    UNICODE_RFIND_METHODDEF
13684
    UNICODE_RINDEX_METHODDEF
13685
    UNICODE_RJUST_METHODDEF
13686
    UNICODE_RSTRIP_METHODDEF
13687
    UNICODE_RPARTITION_METHODDEF
13688
    UNICODE_SPLITLINES_METHODDEF
13689
    UNICODE_STRIP_METHODDEF
13690
    UNICODE_SWAPCASE_METHODDEF
13691
    UNICODE_TRANSLATE_METHODDEF
13692
    UNICODE_UPPER_METHODDEF
13693
    UNICODE_STARTSWITH_METHODDEF
13694
    UNICODE_ENDSWITH_METHODDEF
13695
    UNICODE_REMOVEPREFIX_METHODDEF
13696
    UNICODE_REMOVESUFFIX_METHODDEF
13697
    UNICODE_ISASCII_METHODDEF
13698
    UNICODE_ISLOWER_METHODDEF
13699
    UNICODE_ISUPPER_METHODDEF
13700
    UNICODE_ISTITLE_METHODDEF
13701
    UNICODE_ISSPACE_METHODDEF
13702
    UNICODE_ISDECIMAL_METHODDEF
13703
    UNICODE_ISDIGIT_METHODDEF
13704
    UNICODE_ISNUMERIC_METHODDEF
13705
    UNICODE_ISALPHA_METHODDEF
13706
    UNICODE_ISALNUM_METHODDEF
13707
    UNICODE_ISIDENTIFIER_METHODDEF
13708
    UNICODE_ISPRINTABLE_METHODDEF
13709
    UNICODE_ZFILL_METHODDEF
13710
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13711
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
13712
    UNICODE___FORMAT___METHODDEF
13713
    UNICODE_MAKETRANS_METHODDEF
13714
    UNICODE_SIZEOF_METHODDEF
13715
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13716
    {NULL, NULL}
13717
};
13718
13719
/* nb_remainder slot: `v % w`.  Formats through PyUnicode_Format() when the
   left operand is a str, otherwise returns NotImplemented. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13726
13727
static PyNumberMethods unicode_as_number = {
13728
    0,              /*nb_add*/
13729
    0,              /*nb_subtract*/
13730
    0,              /*nb_multiply*/
13731
    unicode_mod,            /*nb_remainder*/
13732
};
13733
13734
static PySequenceMethods unicode_as_sequence = {
13735
    unicode_length,     /* sq_length */
13736
    PyUnicode_Concat,   /* sq_concat */
13737
    _PyUnicode_Repeat,  /* sq_repeat */
13738
    unicode_getitem,    /* sq_item */
13739
    0,                  /* sq_slice */
13740
    0,                  /* sq_ass_item */
13741
    0,                  /* sq_ass_slice */
13742
    PyUnicode_Contains, /* sq_contains */
13743
};
13744
13745
/* mp_subscript slot: `self[item]` for an index-like object or a slice.
   Any other `item` type raises TypeError. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    /* --- integer-like index --- */
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            /* negative indices count from the end */
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    /* --- neither index nor slice --- */
    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    /* --- slice --- */
    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t slicelength = PySlice_AdjustIndices(
        PyUnicode_GET_LENGTH(self), &start, &stop, step);

    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        /* contiguous slice: either the whole string or a substring */
        if (start == 0 && slicelength == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + slicelength);
    }

    /* General case: step != 1, characters are copied one at a time. */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    size_t cur;
    Py_ssize_t i;

    /* First pass: find the largest selected character, used to size the
       result.  The scan stops early once kind_limit is reached. */
    Py_UCS4 max_char;
    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        max_char = 0;
        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
            const Py_UCS4 seen = PyUnicode_READ(src_kind, src_data, cur);
            if (seen > max_char) {
                max_char = seen;
                if (max_char >= kind_limit) {
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }

    /* Second pass: copy the selected characters into the result. */
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);
    for (cur = start, i = 0; i < slicelength; cur += step, i++) {
        const Py_UCS4 copied = PyUnicode_READ(src_kind, src_data, cur);
        PyUnicode_WRITE(dest_kind, dest_data, i, copied);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
13814
13815
static PyMappingMethods unicode_as_mapping = {
13816
    unicode_length,     /* mp_length */
13817
    unicode_subscript,  /* mp_subscript */
13818
    0,                  /* mp_ass_subscript */
13819
};
13820
13821
13822
static PyObject *
13823
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
13824
13825
/*[clinic input]
13826
@classmethod
13827
str.__new__ as unicode_new
13828
13829
    object as x: object = NULL
13830
    encoding: str = NULL
13831
    errors: str = NULL
13832
13833
[clinic start generated code]*/
13834
13835
static PyObject *
13836
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
13837
                 const char *errors)
13838
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
13839
15.5M
{
13840
15.5M
    PyObject *unicode;
13841
15.5M
    if (x == NULL) {
13842
0
        unicode = _PyUnicode_GetEmpty();
13843
0
    }
13844
15.5M
    else if (encoding == NULL && errors == NULL) {
13845
15.5M
        unicode = PyObject_Str(x);
13846
15.5M
    }
13847
0
    else {
13848
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
13849
0
    }
13850
13851
15.5M
    if (unicode != NULL && type != &PyUnicode_Type) {
13852
15.5M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
13853
15.5M
    }
13854
15.5M
    return unicode;
13855
15.5M
}
13856
13857
static const char *
13858
arg_as_utf8(PyObject *obj, const char *name)
13859
2.84M
{
13860
2.84M
    if (!PyUnicode_Check(obj)) {
13861
0
        PyErr_Format(PyExc_TypeError,
13862
0
                     "str() argument '%s' must be str, not %T",
13863
0
                     name, obj);
13864
0
        return NULL;
13865
0
    }
13866
2.84M
    return _PyUnicode_AsUTF8NoNUL(obj);
13867
2.84M
}
13868
13869
/* Vectorcall entry point for calling the exact `str` type.
   Positional-only calls are handled inline; any call with keyword arguments
   is repackaged into a tuple and dict and forwarded to unicode_new(). */
static PyObject *
unicode_vectorcall(PyObject *type, PyObject *const *args,
                   size_t nargsf, PyObject *kwnames)
{
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));

    const Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);

    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
        // Fallback to unicode_new()
        PyObject *positional = PyTuple_FromArray(args, nargs);
        if (positional == NULL) {
            return NULL;
        }
        PyObject *result = NULL;
        PyObject *keywords = _PyStack_AsDict(args + nargs, kwnames);
        if (keywords != NULL) {
            result = unicode_new(_PyType_CAST(type), positional, keywords);
        }
        Py_DECREF(positional);
        Py_XDECREF(keywords);
        return result;
    }

    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
        return NULL;
    }

    switch (nargs) {
    case 0:
        /* str() */
        return _PyUnicode_GetEmpty();
    case 1:
        /* str(object) */
        return PyObject_Str(args[0]);
    default:
        break;
    }

    /* str(object, encoding[, errors]) */
    const char *encoding = arg_as_utf8(args[1], "encoding");
    if (encoding == NULL) {
        return NULL;
    }

    const char *errors = NULL;
    if (nargs == 3) {
        errors = arg_as_utf8(args[2], "errors");
        if (errors == NULL) {
            return NULL;
        }
    }

    return PyUnicode_FromEncodedObject(args[0], encoding, errors);
}
13915
13916
static PyObject *
13917
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
13918
15.5M
{
13919
15.5M
    PyObject *self;
13920
15.5M
    Py_ssize_t length, char_size;
13921
15.5M
    int share_utf8;
13922
15.5M
    int kind;
13923
15.5M
    void *data;
13924
13925
15.5M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13926
15.5M
    assert(_PyUnicode_CHECK(unicode));
13927
13928
15.5M
    self = type->tp_alloc(type, 0);
13929
15.5M
    if (self == NULL) {
13930
0
        return NULL;
13931
0
    }
13932
15.5M
    kind = PyUnicode_KIND(unicode);
13933
15.5M
    length = PyUnicode_GET_LENGTH(unicode);
13934
13935
15.5M
    _PyUnicode_LENGTH(self) = length;
13936
#ifdef Py_DEBUG
13937
    _PyUnicode_HASH(self) = -1;
13938
#else
13939
15.5M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13940
15.5M
#endif
13941
15.5M
    _PyUnicode_STATE(self).interned = 0;
13942
15.5M
    _PyUnicode_STATE(self).kind = kind;
13943
15.5M
    _PyUnicode_STATE(self).compact = 0;
13944
15.5M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13945
15.5M
    _PyUnicode_STATE(self).statically_allocated = 0;
13946
15.5M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
13947
15.5M
    PyUnicode_SET_UTF8(self, NULL);
13948
15.5M
    _PyUnicode_DATA_ANY(self) = NULL;
13949
13950
15.5M
    share_utf8 = 0;
13951
15.5M
    if (kind == PyUnicode_1BYTE_KIND) {
13952
13.1M
        char_size = 1;
13953
13.1M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13954
13.1M
            share_utf8 = 1;
13955
13.1M
    }
13956
2.43M
    else if (kind == PyUnicode_2BYTE_KIND) {
13957
2.36M
        char_size = 2;
13958
2.36M
    }
13959
61.3k
    else {
13960
61.3k
        assert(kind == PyUnicode_4BYTE_KIND);
13961
61.3k
        char_size = 4;
13962
61.3k
    }
13963
13964
    /* Ensure we won't overflow the length. */
13965
15.5M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13966
0
        PyErr_NoMemory();
13967
0
        goto onError;
13968
0
    }
13969
15.5M
    data = PyMem_Malloc((length + 1) * char_size);
13970
15.5M
    if (data == NULL) {
13971
0
        PyErr_NoMemory();
13972
0
        goto onError;
13973
0
    }
13974
13975
15.5M
    _PyUnicode_DATA_ANY(self) = data;
13976
15.5M
    if (share_utf8) {
13977
13.1M
        PyUnicode_SET_UTF8_LENGTH(self, length);
13978
13.1M
        PyUnicode_SET_UTF8(self, data);
13979
13.1M
    }
13980
13981
15.5M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
13982
15.5M
    assert(_PyUnicode_CheckConsistency(self, 1));
13983
#ifdef Py_DEBUG
13984
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13985
#endif
13986
15.5M
    return self;
13987
13988
0
onError:
13989
0
    Py_DECREF(self);
13990
0
    return NULL;
13991
15.5M
}
13992
13993
static _PyObjectIndexPair
13994
unicode_iteritem(PyObject *obj, Py_ssize_t index)
13995
47.3M
{
13996
47.3M
    if (index >= PyUnicode_GET_LENGTH(obj)) {
13997
3.93M
        return (_PyObjectIndexPair) { .object = NULL, .index = index };
13998
3.93M
    }
13999
43.3M
    const void *data = PyUnicode_DATA(obj);
14000
43.3M
    int kind = PyUnicode_KIND(obj);
14001
43.3M
    Py_UCS4 ch = PyUnicode_READ(kind, data, index);
14002
43.3M
    PyObject *result = unicode_char(ch);
14003
43.3M
    index = (result == NULL) ? -1 : index + 1;
14004
43.3M
    return (_PyObjectIndexPair) { .object = result, .index = index };
14005
47.3M
}
14006
14007
void
14008
_PyUnicode_ExactDealloc(PyObject *op)
14009
74.8M
{
14010
74.8M
    assert(PyUnicode_CheckExact(op));
14011
74.8M
    unicode_dealloc(op);
14012
74.8M
}
14013
14014
PyDoc_STRVAR(unicode_doc,
14015
"str(object='') -> str\n\
14016
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14017
\n\
14018
Create a new string object from the given object. If encoding or\n\
14019
errors is specified, then the object must expose a data buffer\n\
14020
that will be decoded using the given encoding and error handler.\n\
14021
Otherwise, returns the result of object.__str__() (if defined)\n\
14022
or repr(object).\n\
14023
encoding defaults to 'utf-8'.\n\
14024
errors defaults to 'strict'.");
14025
14026
static PyObject *unicode_iter(PyObject *seq);
14027
14028
PyTypeObject PyUnicode_Type = {
14029
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14030
    "str",                        /* tp_name */
14031
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14032
    0,                            /* tp_itemsize */
14033
    /* Slots */
14034
    unicode_dealloc,              /* tp_dealloc */
14035
    0,                            /* tp_vectorcall_offset */
14036
    0,                            /* tp_getattr */
14037
    0,                            /* tp_setattr */
14038
    0,                            /* tp_as_async */
14039
    unicode_repr,                 /* tp_repr */
14040
    &unicode_as_number,           /* tp_as_number */
14041
    &unicode_as_sequence,         /* tp_as_sequence */
14042
    &unicode_as_mapping,          /* tp_as_mapping */
14043
    unicode_hash,                 /* tp_hash*/
14044
    0,                            /* tp_call*/
14045
    unicode_str,                  /* tp_str */
14046
    PyObject_GenericGetAttr,      /* tp_getattro */
14047
    0,                            /* tp_setattro */
14048
    0,                            /* tp_as_buffer */
14049
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14050
        Py_TPFLAGS_UNICODE_SUBCLASS |
14051
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14052
    unicode_doc,                  /* tp_doc */
14053
    0,                            /* tp_traverse */
14054
    0,                            /* tp_clear */
14055
    PyUnicode_RichCompare,        /* tp_richcompare */
14056
    0,                            /* tp_weaklistoffset */
14057
    unicode_iter,                 /* tp_iter */
14058
    0,                            /* tp_iternext */
14059
    unicode_methods,              /* tp_methods */
14060
    0,                            /* tp_members */
14061
    0,                            /* tp_getset */
14062
    0,                            /* tp_base */
14063
    0,                            /* tp_dict */
14064
    0,                            /* tp_descr_get */
14065
    0,                            /* tp_descr_set */
14066
    0,                            /* tp_dictoffset */
14067
    0,                            /* tp_init */
14068
    0,                            /* tp_alloc */
14069
    unicode_new,                  /* tp_new */
14070
    PyObject_Free,                /* tp_free */
14071
    .tp_vectorcall = unicode_vectorcall,
14072
    ._tp_iteritem = unicode_iteritem,
14073
};
14074
14075
/* Initialize the Unicode implementation */
14076
14077
static void
14078
_init_global_state(void)
14079
37
{
14080
37
    static int initialized = 0;
14081
37
    if (initialized) {
14082
0
        return;
14083
0
    }
14084
37
    initialized = 1;
14085
14086
    /* initialize the linebreak bloom filter */
14087
37
    const Py_UCS2 linebreak[] = {
14088
37
        0x000A, /* LINE FEED */
14089
37
        0x000D, /* CARRIAGE RETURN */
14090
37
        0x001C, /* FILE SEPARATOR */
14091
37
        0x001D, /* GROUP SEPARATOR */
14092
37
        0x001E, /* RECORD SEPARATOR */
14093
37
        0x0085, /* NEXT LINE */
14094
37
        0x2028, /* LINE SEPARATOR */
14095
37
        0x2029, /* PARAGRAPH SEPARATOR */
14096
37
    };
14097
37
    bloom_linebreak = make_bloom_mask(
14098
37
        PyUnicode_2BYTE_KIND, linebreak,
14099
37
        Py_ARRAY_LENGTH(linebreak));
14100
37
}
14101
14102
void
14103
_PyUnicode_InitState(PyInterpreterState *interp)
14104
37
{
14105
37
    if (!_Py_IsMainInterpreter(interp)) {
14106
0
        return;
14107
0
    }
14108
37
    _init_global_state();
14109
37
}
14110
14111
14112
PyStatus
14113
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14114
37
{
14115
37
    if (_Py_IsMainInterpreter(interp)) {
14116
37
        PyStatus status = init_global_interned_strings(interp);
14117
37
        if (_PyStatus_EXCEPTION(status)) {
14118
0
            return status;
14119
0
        }
14120
37
    }
14121
37
    assert(INTERNED_STRINGS);
14122
14123
37
    if (init_interned_dict(interp)) {
14124
0
        PyErr_Clear();
14125
0
        return _PyStatus_ERR("failed to create interned dict");
14126
0
    }
14127
14128
37
    return _PyStatus_OK();
14129
37
}
14130
14131
14132
PyStatus
14133
_PyUnicode_InitTypes(PyInterpreterState *interp)
14134
37
{
14135
37
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14136
0
        goto error;
14137
0
    }
14138
37
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14139
0
        goto error;
14140
0
    }
14141
37
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14142
0
        goto error;
14143
0
    }
14144
37
    return _PyStatus_OK();
14145
14146
0
error:
14147
0
    return _PyStatus_ERR("Can't initialize unicode types");
14148
37
}
14149
14150
static /* non-null */ PyObject*
14151
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14152
41.3k
{
14153
    // Note that this steals a reference to `s`, but in many cases that
14154
    // stolen ref is returned, requiring no decref/incref.
14155
14156
41.3k
    assert(s != NULL);
14157
41.3k
    assert(_PyUnicode_CHECK(s));
14158
41.3k
    assert(_PyUnicode_STATE(s).statically_allocated);
14159
41.3k
    assert(!PyUnicode_CHECK_INTERNED(s));
14160
14161
#ifdef Py_DEBUG
14162
    /* We must not add process-global interned string if there's already a
14163
     * per-interpreter interned_dict, which might contain duplicates.
14164
     */
14165
    PyObject *interned = get_interned_dict(interp);
14166
    assert(interned == NULL);
14167
#endif
14168
14169
    /* Look in the global cache first. */
14170
41.3k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14171
    /* We should only init each string once */
14172
41.3k
    assert(r == NULL);
14173
    /* but just in case (for the non-debug build), handle this */
14174
41.3k
    if (r != NULL && r != s) {
14175
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14176
0
        assert(_PyUnicode_CHECK(r));
14177
0
        Py_DECREF(s);
14178
0
        return Py_NewRef(r);
14179
0
    }
14180
14181
41.3k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14182
0
        Py_FatalError("failed to intern static string");
14183
0
    }
14184
14185
41.3k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14186
41.3k
    return s;
14187
41.3k
}
14188
14189
void
14190
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14191
41.3k
{
14192
    // This should only be called as part of runtime initialization
14193
41.3k
    assert(!Py_IsInitialized());
14194
14195
41.3k
    *p = intern_static(interp, *p);
14196
41.3k
    assert(*p);
14197
41.3k
}
14198
14199
static void
14200
immortalize_interned(PyObject *s)
14201
288k
{
14202
288k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14203
288k
    assert(!_Py_IsImmortal(s));
14204
#ifdef Py_REF_DEBUG
14205
    /* The reference count value should be excluded from the RefTotal.
14206
       The decrements to these objects will not be registered so they
14207
       need to be accounted for in here. */
14208
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14209
        _Py_DecRefTotal(_PyThreadState_GET());
14210
    }
14211
#endif
14212
288k
    _Py_SetImmortal(s);
14213
    // The switch to SSTATE_INTERNED_IMMORTAL must be the last thing done here
14214
    // to synchronize with the check in intern_common() that avoids locking if
14215
    // the string is already immortal.
14216
288k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14217
288k
}
14218
14219
#ifdef Py_GIL_DISABLED
/* Free-threaded build only.  Returns true when `s` is owned by the current
   thread, is already immortal, or has a merged shared reference count. */
static bool
can_immortalize_safely(PyObject *s)
{
    if (!_Py_IsOwnedByCurrentThread(s) && !_Py_IsImmortal(s)) {
        Py_ssize_t shared_refs = _Py_atomic_load_ssize(&s->ob_ref_shared);
        return _Py_REF_IS_MERGED(shared_refs);
    }
    return true;
}
#endif
14230
14231
static /* non-null */ PyObject*
14232
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14233
              bool immortalize)
14234
92.6M
{
14235
    // Note that this steals a reference to `s`, but in many cases that
14236
    // stolen ref is returned, requiring no decref/incref.
14237
14238
#ifdef Py_DEBUG
14239
    assert(s != NULL);
14240
    assert(_PyUnicode_CHECK(s));
14241
#else
14242
92.6M
    if (s == NULL || !PyUnicode_Check(s)) {
14243
0
        return s;
14244
0
    }
14245
92.6M
#endif
14246
14247
    /* If it's a subclass, we don't really know what putting
14248
       it in the interned dict might do. */
14249
92.6M
    if (!PyUnicode_CheckExact(s)) {
14250
0
        return s;
14251
0
    }
14252
14253
    /* Is it already interned? */
14254
92.6M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14255
6.04M
        case SSTATE_NOT_INTERNED:
14256
            // no, go on
14257
6.04M
            break;
14258
29.2k
        case SSTATE_INTERNED_MORTAL:
14259
29.2k
#ifndef Py_GIL_DISABLED
14260
            // yes but we might need to make it immortal
14261
29.2k
            if (immortalize) {
14262
1.77k
                immortalize_interned(s);
14263
1.77k
            }
14264
29.2k
            return s;
14265
#else
14266
            // not fully interned yet; fall through to the locking path
14267
            break;
14268
#endif
14269
86.6M
        default:
14270
            // all done
14271
86.6M
            return s;
14272
92.6M
    }
14273
14274
    /* Statically allocated strings must be already interned. */
14275
92.6M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14276
14277
#if Py_GIL_DISABLED
14278
    /* In the free-threaded build, all interned strings are immortal */
14279
    immortalize = 1;
14280
#endif
14281
14282
    /* If it's already immortal, intern it as such */
14283
6.04M
    if (_Py_IsImmortal(s)) {
14284
0
        immortalize = 1;
14285
0
    }
14286
14287
    /* if it's a short string, get the singleton */
14288
6.04M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14289
19.0k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14290
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14291
0
        assert(PyUnicode_CHECK_INTERNED(r));
14292
0
        Py_DECREF(s);
14293
0
        return r;
14294
0
    }
14295
#ifdef Py_DEBUG
14296
    assert(!unicode_is_singleton(s));
14297
#endif
14298
14299
    /* Look in the global cache now. */
14300
6.04M
    {
14301
6.04M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14302
6.04M
        if (r != NULL) {
14303
531k
            assert(_PyUnicode_STATE(r).statically_allocated);
14304
531k
            assert(r != s);  // r must be statically_allocated; s is not
14305
531k
            Py_DECREF(s);
14306
531k
            return Py_NewRef(r);
14307
531k
        }
14308
6.04M
    }
14309
14310
    /* Do a setdefault on the per-interpreter cache. */
14311
5.51M
    PyObject *interned = get_interned_dict(interp);
14312
5.51M
    assert(interned != NULL);
14313
#ifdef Py_GIL_DISABLED
14314
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14315
    // Lock-free fast path: check if there's already an interned copy that
14316
    // is in its final immortal state.
14317
    PyObject *r;
14318
    int res = PyDict_GetItemRef(interned, s, &r);
14319
    if (res < 0) {
14320
        PyErr_Clear();
14321
        return s;
14322
    }
14323
    if (res > 0) {
14324
        unsigned int state = _Py_atomic_load_uint8(&_PyUnicode_STATE(r).interned);
14325
        if (state == SSTATE_INTERNED_IMMORTAL) {
14326
            Py_DECREF(s);
14327
            return r;
14328
        }
14329
        // Not yet fully interned; fall through to the locking path.
14330
        Py_DECREF(r);
14331
    }
14332
#endif
14333
14334
#ifdef Py_GIL_DISABLED
14335
    // Immortalization writes to the refcount fields non-atomically. That
14336
    // races with Py_INCREF / Py_DECREF on the thread that owns `s`. If we
14337
    // don't own it (and its refcount hasn't been merged), intern a copy
14338
    // we own instead.
14339
    if (!can_immortalize_safely(s)) {
14340
        PyObject *copy = _PyUnicode_Copy(s);
14341
        if (copy == NULL) {
14342
            PyErr_Clear();
14343
            return s;
14344
        }
14345
        Py_DECREF(s);
14346
        s = copy;
14347
    }
14348
#endif
14349
14350
5.51M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14351
5.51M
    PyObject *t;
14352
5.51M
    {
14353
5.51M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14354
5.51M
        if (res < 0) {
14355
0
            PyErr_Clear();
14356
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14357
0
            return s;
14358
0
        }
14359
5.51M
        else if (res == 1) {
14360
            // value was already present (not inserted)
14361
4.74M
            Py_DECREF(s);
14362
4.74M
            if (immortalize &&
14363
1.10M
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14364
10.1k
                immortalize_interned(t);
14365
10.1k
            }
14366
4.74M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14367
4.74M
            return t;
14368
4.74M
        }
14369
768k
        else {
14370
            // value was newly inserted
14371
768k
            assert (s == t);
14372
768k
            Py_DECREF(t);
14373
768k
        }
14374
5.51M
    }
14375
14376
    /* NOT_INTERNED -> INTERNED_MORTAL */
14377
14378
5.51M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14379
14380
768k
    if (!_Py_IsImmortal(s)) {
14381
        /* The two references in interned dict (key and value) are not counted.
14382
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14383
768k
        Py_DECREF(s);
14384
768k
        Py_DECREF(s);
14385
768k
    }
14386
768k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14387
14388
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14389
14390
#ifdef Py_DEBUG
14391
    if (_Py_IsImmortal(s)) {
14392
        assert(immortalize);
14393
    }
14394
#endif
14395
768k
    if (immortalize) {
14396
276k
        immortalize_interned(s);
14397
276k
    }
14398
14399
768k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14400
768k
    return s;
14401
5.51M
}
14402
14403
void
14404
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14405
15.2M
{
14406
15.2M
    *p = intern_common(interp, *p, 1);
14407
15.2M
    assert(*p);
14408
15.2M
}
14409
14410
void
14411
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14412
77.4M
{
14413
77.4M
    *p = intern_common(interp, *p, 0);
14414
77.4M
    assert(*p);
14415
77.4M
}
14416
14417
14418
void
14419
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14420
0
{
14421
0
    _PyUnicode_InternImmortal(interp, p);
14422
0
    return;
14423
0
}
14424
14425
void
14426
PyUnicode_InternInPlace(PyObject **p)
14427
0
{
14428
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14429
0
    _PyUnicode_InternMortal(interp, p);
14430
0
}
14431
14432
// Public-looking name kept for the stable ABI; user should not call this:
14433
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14434
void
14435
PyUnicode_InternImmortal(PyObject **p)
14436
0
{
14437
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14438
0
    _PyUnicode_InternImmortal(interp, p);
14439
0
}
14440
14441
PyObject *
14442
PyUnicode_InternFromString(const char *cp)
14443
1.31M
{
14444
1.31M
    PyObject *s = PyUnicode_FromString(cp);
14445
1.31M
    if (s == NULL) {
14446
0
        return NULL;
14447
0
    }
14448
1.31M
    PyInterpreterState *interp = _PyInterpreterState_GET();
14449
1.31M
    _PyUnicode_InternMortal(interp, &s);
14450
1.31M
    return s;
14451
1.31M
}
14452
14453
14454
void
14455
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14456
0
{
14457
0
    PyObject *interned = get_interned_dict(interp);
14458
0
    if (interned == NULL) {
14459
0
        return;
14460
0
    }
14461
0
    assert(PyDict_CheckExact(interned));
14462
14463
0
    if (has_shared_intern_dict(interp)) {
14464
        // the dict doesn't belong to this interpreter, skip the debug
14465
        // checks on it and just clear the pointer to it
14466
0
        clear_interned_dict(interp);
14467
0
        return;
14468
0
    }
14469
14470
#ifdef INTERNED_STATS
14471
    fprintf(stderr, "releasing %zd interned strings\n",
14472
            PyDict_GET_SIZE(interned));
14473
14474
    Py_ssize_t total_length = 0;
14475
#endif
14476
0
    Py_ssize_t pos = 0;
14477
0
    PyObject *s, *ignored_value;
14478
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14479
0
        int shared = 0;
14480
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
14481
0
        case SSTATE_INTERNED_IMMORTAL:
14482
            /* Make immortal interned strings mortal again. */
14483
            // Skip the Immortal Instance check and restore
14484
            // the two references (key and value) ignored
14485
            // by PyUnicode_InternInPlace().
14486
0
            _Py_SetMortal(s, 2);
14487
#ifdef Py_REF_DEBUG
14488
            /* let's be pedantic with the ref total */
14489
            _Py_IncRefTotal(_PyThreadState_GET());
14490
            _Py_IncRefTotal(_PyThreadState_GET());
14491
#endif
14492
#ifdef INTERNED_STATS
14493
            total_length += PyUnicode_GET_LENGTH(s);
14494
#endif
14495
0
            break;
14496
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
14497
            /* It is shared between interpreters, so we should unmark it
14498
               only when this is the last interpreter in which it's
14499
               interned.  We immortalize all the statically initialized
14500
               strings during startup, so we can rely on the
14501
               main interpreter to be the last one. */
14502
0
            if (!_Py_IsMainInterpreter(interp)) {
14503
0
                shared = 1;
14504
0
            }
14505
0
            break;
14506
0
        case SSTATE_INTERNED_MORTAL:
14507
            // Restore 2 references held by the interned dict; these will
14508
            // be decref'd by clear_interned_dict's PyDict_Clear.
14509
0
            _Py_RefcntAdd(s, 2);
14510
#ifdef Py_REF_DEBUG
14511
            /* let's be pedantic with the ref total */
14512
            _Py_IncRefTotal(_PyThreadState_GET());
14513
            _Py_IncRefTotal(_PyThreadState_GET());
14514
#endif
14515
0
            break;
14516
0
        case SSTATE_NOT_INTERNED:
14517
0
            _Py_FALLTHROUGH;
14518
0
        default:
14519
0
            Py_UNREACHABLE();
14520
0
        }
14521
0
        if (!shared) {
14522
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
14523
0
        }
14524
0
    }
14525
#ifdef INTERNED_STATS
14526
    fprintf(stderr,
14527
            "total length of all interned strings: %zd characters\n",
14528
            total_length);
14529
#endif
14530
14531
0
    struct _Py_unicode_state *state = &interp->unicode;
14532
0
    struct _Py_unicode_ids *ids = &state->ids;
14533
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
14534
0
        Py_XINCREF(ids->array[i]);
14535
0
    }
14536
0
    clear_interned_dict(interp);
14537
0
    if (_Py_IsMainInterpreter(interp)) {
14538
0
        clear_global_interned_strings();
14539
0
    }
14540
0
}
14541
14542
14543
/********************* Unicode Iterator **************************/
14544
14545
/* Instance layout shared by the str iterator functions in this section. */
typedef struct {
14546
    PyObject_HEAD
14547
    Py_ssize_t it_index; /* Index of the next character to return */
14548
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14549
} unicodeiterobject;
14550
14551
static void
14552
unicodeiter_dealloc(PyObject *op)
14553
1.65M
{
14554
1.65M
    unicodeiterobject *it = (unicodeiterobject *)op;
14555
1.65M
    _PyObject_GC_UNTRACK(it);
14556
1.65M
    Py_XDECREF(it->it_seq);
14557
1.65M
    PyObject_GC_Del(it);
14558
1.65M
}
14559
14560
static int
14561
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
14562
14
{
14563
14
    unicodeiterobject *it = (unicodeiterobject *)op;
14564
14
    Py_VISIT(it->it_seq);
14565
14
    return 0;
14566
14
}
14567
14568
/* tp_iternext for str_iterator: return the next character as a string.
   Returns NULL once the iterator is exhausted; at that point the reference
   to the string is released and it_seq is set to NULL. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *self = (unicodeiterobject *)op;
    assert(self != NULL);

    PyObject *str = self->it_seq;
    if (str == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(str));

    if (self->it_index >= PyUnicode_GET_LENGTH(str)) {
        /* Past the last character: mark the iterator as exhausted. */
        self->it_seq = NULL;
        Py_DECREF(str);
        return NULL;
    }

    int char_kind = PyUnicode_KIND(str);
    const void *chars = PyUnicode_DATA(str);
    Py_UCS4 code_point = PyUnicode_READ(char_kind, chars, self->it_index);
    self->it_index++;
    return unicode_char(code_point);
}
14592
14593
/* Iterator step for compact ASCII strings (asserted): each character is
   returned as the matching entry of the ascii singleton table.
   Returns NULL once exhausted, after releasing the string and setting
   it_seq to NULL. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *self = (unicodeiterobject *)op;
    assert(self != NULL);

    PyObject *str = self->it_seq;
    if (str == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(str));
    assert(PyUnicode_IS_COMPACT_ASCII(str));

    if (self->it_index >= PyUnicode_GET_LENGTH(str)) {
        self->it_seq = NULL;
        Py_DECREF(str);
        return NULL;
    }

    /* The character data is read from directly after the PyASCIIObject
       header. */
    const void *chars = ((void*)(_PyASCIIObject_CAST(str) + 1));
    Py_UCS1 ascii_char = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                                 chars, self->it_index);
    self->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[ascii_char];
}
14615
14616
/* __length_hint__: number of characters still to be produced, or 0 when
   the iterator no longer holds a string. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *self = (unicodeiterobject *)op;

    Py_ssize_t remaining = 0;
    if (self->it_seq != NULL) {
        remaining = PyUnicode_GET_LENGTH(self->it_seq) - self->it_index;
    }
    return PyLong_FromSsize_t(remaining);
}
14625
14626
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14627
14628
/* __reduce__: returns (iter, (string,), index) while the iterator still
   holds a string, and (iter, (empty string,)) once it is exhausted. */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *self = (unicodeiterobject *)op;

    /* _PyEval_GetBuiltin can invoke arbitrary code, so this call must
     * come before any access to the iterator's fields.
     * see issue #101765 */
    PyObject *iter_builtin = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (self->it_seq != NULL) {
        return Py_BuildValue("N(O)n", iter_builtin,
                             self->it_seq, self->it_index);
    }

    PyObject *empty = _PyUnicode_GetEmpty();
    if (empty == NULL) {
        Py_XDECREF(iter_builtin);
        return NULL;
    }
    return Py_BuildValue("N(N)", iter_builtin, empty);
}
14649
14650
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14651
14652
/* __setstate__: set the iteration position from the integer `state`,
   clamped to [0, len(string)].  An exhausted iterator is left untouched.
   Returns None, or NULL if `state` cannot be converted to Py_ssize_t. */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *self = (unicodeiterobject *)op;

    Py_ssize_t position = PyLong_AsSsize_t(state);
    if (position == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (self->it_seq != NULL) {
        const Py_ssize_t length = PyUnicode_GET_LENGTH(self->it_seq);
        if (position < 0) {
            position = 0;
        }
        else if (position > length) {
            position = length; /* iterator truncated */
        }
        self->it_index = position;
    }
    Py_RETURN_NONE;
}
14668
14669
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14670
14671
static PyMethodDef unicodeiter_methods[] = {
14672
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
14673
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
14674
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
14675
    {NULL,      NULL}       /* sentinel */
14676
};
14677
14678
PyTypeObject PyUnicodeIter_Type = {
14679
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14680
    "str_iterator",         /* tp_name */
14681
    sizeof(unicodeiterobject),      /* tp_basicsize */
14682
    0,                  /* tp_itemsize */
14683
    /* methods */
14684
    unicodeiter_dealloc,/* tp_dealloc */
14685
    0,                  /* tp_vectorcall_offset */
14686
    0,                  /* tp_getattr */
14687
    0,                  /* tp_setattr */
14688
    0,                  /* tp_as_async */
14689
    0,                  /* tp_repr */
14690
    0,                  /* tp_as_number */
14691
    0,                  /* tp_as_sequence */
14692
    0,                  /* tp_as_mapping */
14693
    0,                  /* tp_hash */
14694
    0,                  /* tp_call */
14695
    0,                  /* tp_str */
14696
    PyObject_GenericGetAttr,        /* tp_getattro */
14697
    0,                  /* tp_setattro */
14698
    0,                  /* tp_as_buffer */
14699
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14700
    0,                  /* tp_doc */
14701
    unicodeiter_traverse, /* tp_traverse */
14702
    0,                  /* tp_clear */
14703
    0,                  /* tp_richcompare */
14704
    0,                  /* tp_weaklistoffset */
14705
    PyObject_SelfIter,          /* tp_iter */
14706
    unicodeiter_next,   /* tp_iternext */
14707
    unicodeiter_methods,            /* tp_methods */
14708
    0,
14709
};
14710
14711
PyTypeObject _PyUnicodeASCIIIter_Type = {
14712
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14713
    .tp_name = "str_ascii_iterator",
14714
    .tp_basicsize = sizeof(unicodeiterobject),
14715
    .tp_dealloc = unicodeiter_dealloc,
14716
    .tp_getattro = PyObject_GenericGetAttr,
14717
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14718
    .tp_traverse = unicodeiter_traverse,
14719
    .tp_iter = PyObject_SelfIter,
14720
    .tp_iternext = unicode_ascii_iter_next,
14721
    .tp_methods = unicodeiter_methods,
14722
};
14723
14724
/* Create a new iterator over the str object `seq`.

   Compact ASCII strings get a _PyUnicodeASCIIIter_Type instance, every
   other string a PyUnicodeIter_Type instance.  The iterator owns a strong
   reference to `seq` and starts at index 0.

   Returns a new reference, or NULL with an exception set when `seq` is not
   a str or the allocation fails. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type;
    if (PyUnicode_IS_COMPACT_ASCII(seq)) {
        iter_type = &_PyUnicodeASCIIIter_Type;
    }
    else {
        iter_type = &PyUnicodeIter_Type;
    }

    unicodeiterobject *self = PyObject_GC_New(unicodeiterobject, iter_type);
    if (self == NULL) {
        return NULL;
    }

    self->it_index = 0;
    self->it_seq = Py_NewRef(seq);
    /* Both fields are set before the object becomes visible to the GC. */
    _PyObject_GC_TRACK(self);
    return (PyObject *)self;
}
14746
14747
static int
14748
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14749
148
{
14750
148
    int res;
14751
148
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14752
148
    if (res == -2) {
14753
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
14754
0
        return -1;
14755
0
    }
14756
148
    if (res < 0) {
14757
0
        PyErr_NoMemory();
14758
0
        return -1;
14759
0
    }
14760
148
    return 0;
14761
148
}
14762
14763
14764
static int
14765
config_get_codec_name(wchar_t **config_encoding)
14766
74
{
14767
74
    char *encoding;
14768
74
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14769
0
        return -1;
14770
0
    }
14771
14772
74
    PyObject *name_obj = NULL;
14773
74
    PyObject *codec = _PyCodec_Lookup(encoding);
14774
74
    PyMem_RawFree(encoding);
14775
14776
74
    if (!codec)
14777
0
        goto error;
14778
14779
74
    name_obj = PyObject_GetAttrString(codec, "name");
14780
74
    Py_CLEAR(codec);
14781
74
    if (!name_obj) {
14782
0
        goto error;
14783
0
    }
14784
14785
74
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14786
74
    Py_DECREF(name_obj);
14787
74
    if (wname == NULL) {
14788
0
        goto error;
14789
0
    }
14790
14791
74
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14792
74
    if (raw_wname == NULL) {
14793
0
        PyMem_Free(wname);
14794
0
        PyErr_NoMemory();
14795
0
        goto error;
14796
0
    }
14797
14798
74
    PyMem_RawFree(*config_encoding);
14799
74
    *config_encoding = raw_wname;
14800
14801
74
    PyMem_Free(wname);
14802
74
    return 0;
14803
14804
0
error:
14805
0
    Py_XDECREF(codec);
14806
0
    Py_XDECREF(name_obj);
14807
0
    return -1;
14808
74
}
14809
14810
14811
static PyStatus
14812
init_stdio_encoding(PyInterpreterState *interp)
14813
37
{
14814
    /* Update the stdio encoding to the normalized Python codec name. */
14815
37
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14816
37
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
14817
0
        return _PyStatus_ERR("failed to get the Python codec name "
14818
0
                             "of the stdio encoding");
14819
0
    }
14820
37
    return _PyStatus_OK();
14821
37
}
14822
14823
14824
static int
14825
init_fs_codec(PyInterpreterState *interp)
14826
37
{
14827
37
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
14828
14829
37
    _Py_error_handler error_handler;
14830
37
    error_handler = get_error_handler_wide(config->filesystem_errors);
14831
37
    if (error_handler == _Py_ERROR_UNKNOWN) {
14832
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
14833
0
        return -1;
14834
0
    }
14835
14836
37
    char *encoding, *errors;
14837
37
    if (encode_wstr_utf8(config->filesystem_encoding,
14838
37
                         &encoding,
14839
37
                         "filesystem_encoding") < 0) {
14840
0
        return -1;
14841
0
    }
14842
14843
37
    if (encode_wstr_utf8(config->filesystem_errors,
14844
37
                         &errors,
14845
37
                         "filesystem_errors") < 0) {
14846
0
        PyMem_RawFree(encoding);
14847
0
        return -1;
14848
0
    }
14849
14850
37
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
14851
37
    PyMem_RawFree(fs_codec->encoding);
14852
37
    fs_codec->encoding = encoding;
14853
    /* encoding has been normalized by init_fs_encoding() */
14854
37
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
14855
37
    PyMem_RawFree(fs_codec->errors);
14856
37
    fs_codec->errors = errors;
14857
37
    fs_codec->error_handler = error_handler;
14858
14859
#ifdef _Py_FORCE_UTF8_FS_ENCODING
14860
    assert(fs_codec->utf8 == 1);
14861
#endif
14862
14863
    /* At this point, PyUnicode_EncodeFSDefault() and
14864
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
14865
       the C implementation of the filesystem encoding. */
14866
14867
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
14868
       global configuration variables. */
14869
37
    if (_Py_IsMainInterpreter(interp)) {
14870
14871
37
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
14872
37
                                      fs_codec->errors) < 0) {
14873
0
            PyErr_NoMemory();
14874
0
            return -1;
14875
0
        }
14876
37
    }
14877
37
    return 0;
14878
37
}
14879
14880
14881
static PyStatus
14882
init_fs_encoding(PyThreadState *tstate)
14883
37
{
14884
37
    PyInterpreterState *interp = tstate->interp;
14885
14886
    /* Update the filesystem encoding to the normalized Python codec name.
14887
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
14888
       (Python codec name). */
14889
37
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14890
37
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
14891
0
        _Py_DumpPathConfig(tstate);
14892
0
        return _PyStatus_ERR("failed to get the Python codec "
14893
0
                             "of the filesystem encoding");
14894
0
    }
14895
14896
37
    if (init_fs_codec(interp) < 0) {
14897
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
14898
0
    }
14899
37
    return _PyStatus_OK();
14900
37
}
14901
14902
14903
PyStatus
14904
_PyUnicode_InitEncodings(PyThreadState *tstate)
14905
37
{
14906
37
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
14907
37
    if (_PyStatus_EXCEPTION(status)) {
14908
0
        return status;
14909
0
    }
14910
37
    status = init_fs_encoding(tstate);
14911
37
    if (_PyStatus_EXCEPTION(status)) {
14912
0
        return status;
14913
0
    }
14914
14915
37
    return init_stdio_encoding(tstate->interp);
14916
37
}
14917
14918
14919
static void
14920
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
14921
0
{
14922
0
    PyMem_RawFree(fs_codec->encoding);
14923
0
    fs_codec->encoding = NULL;
14924
0
    fs_codec->utf8 = 0;
14925
0
    PyMem_RawFree(fs_codec->errors);
14926
0
    fs_codec->errors = NULL;
14927
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
14928
0
}
14929
14930
14931
#ifdef Py_DEBUG
/* Debug-build helper: returns non-zero when the main interpreter has no
   interned dict (get_interned_dict() returns NULL). */
static inline int
unicode_is_finalizing(void)
{
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
    return get_interned_dict(main_interp) == NULL;
}
#endif
14938
14939
14940
void
14941
_PyUnicode_FiniTypes(PyInterpreterState *interp)
14942
0
{
14943
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
14944
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
14945
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
14946
0
}
14947
14948
14949
void
14950
_PyUnicode_Fini(PyInterpreterState *interp)
14951
0
{
14952
0
    struct _Py_unicode_state *state = &interp->unicode;
14953
14954
0
    if (!has_shared_intern_dict(interp)) {
14955
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
14956
0
        assert(get_interned_dict(interp) == NULL);
14957
0
    }
14958
14959
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
14960
14961
    // bpo-47182: force a unicodedata CAPI capsule re-import on
14962
    // subsequent initialization of interpreter.
14963
0
    interp->unicode.ucnhash_capi = NULL;
14964
14965
0
    unicode_clear_identifiers(state);
14966
0
}
14967
14968
/* A _string module, to export formatter_parser and formatter_field_name_split
14969
   to the string.Formatter class implemented in Python. */
14970
14971
static PyMethodDef _string_methods[] = {
14972
    {"formatter_field_name_split", formatter_field_name_split,
14973
     METH_O, PyDoc_STR("split the argument as a field name")},
14974
    {"formatter_parser", formatter_parser,
14975
     METH_O, PyDoc_STR("parse the argument as a format string")},
14976
    {NULL, NULL}
14977
};
14978
14979
static PyModuleDef_Slot module_slots[] = {
14980
    _Py_ABI_SLOT,
14981
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
14982
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
14983
    {0, NULL}
14984
};
14985
14986
static struct PyModuleDef _string_module = {
14987
    PyModuleDef_HEAD_INIT,
14988
    .m_name = "_string",
14989
    .m_doc = PyDoc_STR("string helper module"),
14990
    .m_size = 0,
14991
    .m_methods = _string_methods,
14992
    .m_slots = module_slots,
14993
};
14994
14995
PyMODINIT_FUNC
14996
PyInit__string(void)
14997
8
{
14998
8
    return PyModuleDef_Init(&_string_module);
14999
8
}
15000
15001
15002
#undef PyUnicode_KIND
15003
int PyUnicode_KIND(PyObject *op)
15004
0
{
15005
0
    if (!PyUnicode_Check(op)) {
15006
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
15007
0
        return -1;
15008
0
    }
15009
0
    return _PyASCIIObject_CAST(op)->state.kind;
15010
0
}
15011
15012
#undef PyUnicode_DATA
15013
void* PyUnicode_DATA(PyObject *op)
15014
0
{
15015
0
    if (!PyUnicode_Check(op)) {
15016
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
15017
0
        return NULL;
15018
0
    }
15019
0
    return _PyUnicode_DATA(op);
15020
0
}